// Per-lane bit masks used by decodeArray().
// Each static_mask_N array holds four identical 32-bit values in which only the
// low N bits are set (N = 21, 12, 10, 9, 7, 6, 5, 4, 3, 2, 1).
// decodeArray() loads every array into an __m128i (mask_N) with _mm_loadu_si128
// before entering its decode loop; static_mask_1 is additionally loaded as the
// fill value for key 0x00 when NO_ZEROS is defined.
// ALIGN_16 is defined outside this view - presumably a 16-byte alignment
// attribute; verify at its definition.
static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; void ANT_compress_qmx_v3::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) { __m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; uint8_t *in = (uint8_t *)source; uint8_t *keys = ((uint8_t *)source) + len - 1; mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); mask_5 = _mm_loadu_si128((__m128i *)static_mask_5); mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers { switch (*keys--) { case 0x00: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif 
_mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 
42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to 
+ 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 
+ 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 
16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i 
*)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to 
+ 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 
320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); 
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); 
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); 
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, 
tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, 
tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 
576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 
+ 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); _mm_storeu_si128((__m128i *)to + 640, tmp); _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); _mm_storeu_si128((__m128i *)to + 640 + 
18, tmp); _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); _mm_storeu_si128((__m128i 
*)to + 640 + 59, tmp); _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); _mm_storeu_si128((__m128i *)to + 704, tmp); _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); _mm_storeu_si128((__m128i *)to 
+ 704 + 36, tmp); _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); _mm_storeu_si128((__m128i *)to + 768, tmp); _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); _mm_storeu_si128((__m128i *)to + 
768 + 13, tmp); _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); 
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp); _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); _mm_storeu_si128((__m128i *)to + 832, tmp); _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); 
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp); _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); _mm_storeu_si128((__m128i *)to + 896, tmp); _mm_storeu_si128((__m128i *)to + 896 + 1, tmp); _mm_storeu_si128((__m128i *)to + 896 + 2, tmp); _mm_storeu_si128((__m128i *)to + 896 + 3, tmp); _mm_storeu_si128((__m128i *)to + 896 + 4, tmp); _mm_storeu_si128((__m128i *)to + 896 + 5, tmp); _mm_storeu_si128((__m128i *)to + 896 + 6, tmp); _mm_storeu_si128((__m128i *)to + 896 + 7, tmp); 
_mm_storeu_si128((__m128i *)to + 896 + 8, tmp); _mm_storeu_si128((__m128i *)to + 896 + 9, tmp); _mm_storeu_si128((__m128i *)to + 896 + 10, tmp); _mm_storeu_si128((__m128i *)to + 896 + 11, tmp); _mm_storeu_si128((__m128i *)to + 896 + 12, tmp); _mm_storeu_si128((__m128i *)to + 896 + 13, tmp); _mm_storeu_si128((__m128i *)to + 896 + 14, tmp); _mm_storeu_si128((__m128i *)to + 896 + 15, tmp); _mm_storeu_si128((__m128i *)to + 896 + 16, tmp); _mm_storeu_si128((__m128i *)to + 896 + 17, tmp); _mm_storeu_si128((__m128i *)to + 896 + 18, tmp); _mm_storeu_si128((__m128i *)to + 896 + 19, tmp); _mm_storeu_si128((__m128i *)to + 896 + 20, tmp); _mm_storeu_si128((__m128i *)to + 896 + 21, tmp); _mm_storeu_si128((__m128i *)to + 896 + 22, tmp); _mm_storeu_si128((__m128i *)to + 896 + 23, tmp); _mm_storeu_si128((__m128i *)to + 896 + 24, tmp); _mm_storeu_si128((__m128i *)to + 896 + 25, tmp); _mm_storeu_si128((__m128i *)to + 896 + 26, tmp); _mm_storeu_si128((__m128i *)to + 896 + 27, tmp); _mm_storeu_si128((__m128i *)to + 896 + 28, tmp); _mm_storeu_si128((__m128i *)to + 896 + 29, tmp); _mm_storeu_si128((__m128i *)to + 896 + 30, tmp); _mm_storeu_si128((__m128i *)to + 896 + 31, tmp); _mm_storeu_si128((__m128i *)to + 896 + 32, tmp); _mm_storeu_si128((__m128i *)to + 896 + 33, tmp); _mm_storeu_si128((__m128i *)to + 896 + 34, tmp); _mm_storeu_si128((__m128i *)to + 896 + 35, tmp); _mm_storeu_si128((__m128i *)to + 896 + 36, tmp); _mm_storeu_si128((__m128i *)to + 896 + 37, tmp); _mm_storeu_si128((__m128i *)to + 896 + 38, tmp); _mm_storeu_si128((__m128i *)to + 896 + 39, tmp); _mm_storeu_si128((__m128i *)to + 896 + 40, tmp); _mm_storeu_si128((__m128i *)to + 896 + 41, tmp); _mm_storeu_si128((__m128i *)to + 896 + 42, tmp); _mm_storeu_si128((__m128i *)to + 896 + 43, tmp); _mm_storeu_si128((__m128i *)to + 896 + 44, tmp); _mm_storeu_si128((__m128i *)to + 896 + 45, tmp); _mm_storeu_si128((__m128i *)to + 896 + 46, tmp); _mm_storeu_si128((__m128i *)to + 896 + 47, tmp); _mm_storeu_si128((__m128i *)to + 896 + 
48, tmp); _mm_storeu_si128((__m128i *)to + 896 + 49, tmp); _mm_storeu_si128((__m128i *)to + 896 + 50, tmp); _mm_storeu_si128((__m128i *)to + 896 + 51, tmp); _mm_storeu_si128((__m128i *)to + 896 + 52, tmp); _mm_storeu_si128((__m128i *)to + 896 + 53, tmp); _mm_storeu_si128((__m128i *)to + 896 + 54, tmp); _mm_storeu_si128((__m128i *)to + 896 + 55, tmp); _mm_storeu_si128((__m128i *)to + 896 + 56, tmp); _mm_storeu_si128((__m128i *)to + 896 + 57, tmp); _mm_storeu_si128((__m128i *)to + 896 + 58, tmp); _mm_storeu_si128((__m128i *)to + 896 + 59, tmp); _mm_storeu_si128((__m128i *)to + 896 + 60, tmp); _mm_storeu_si128((__m128i *)to + 896 + 61, tmp); _mm_storeu_si128((__m128i *)to + 896 + 62, tmp); _mm_storeu_si128((__m128i *)to + 896 + 63, tmp); _mm_storeu_si128((__m128i *)to + 960, tmp); _mm_storeu_si128((__m128i *)to + 960 + 1, tmp); _mm_storeu_si128((__m128i *)to + 960 + 2, tmp); _mm_storeu_si128((__m128i *)to + 960 + 3, tmp); _mm_storeu_si128((__m128i *)to + 960 + 4, tmp); _mm_storeu_si128((__m128i *)to + 960 + 5, tmp); _mm_storeu_si128((__m128i *)to + 960 + 6, tmp); _mm_storeu_si128((__m128i *)to + 960 + 7, tmp); _mm_storeu_si128((__m128i *)to + 960 + 8, tmp); _mm_storeu_si128((__m128i *)to + 960 + 9, tmp); _mm_storeu_si128((__m128i *)to + 960 + 10, tmp); _mm_storeu_si128((__m128i *)to + 960 + 11, tmp); _mm_storeu_si128((__m128i *)to + 960 + 12, tmp); _mm_storeu_si128((__m128i *)to + 960 + 13, tmp); _mm_storeu_si128((__m128i *)to + 960 + 14, tmp); _mm_storeu_si128((__m128i *)to + 960 + 15, tmp); _mm_storeu_si128((__m128i *)to + 960 + 16, tmp); _mm_storeu_si128((__m128i *)to + 960 + 17, tmp); _mm_storeu_si128((__m128i *)to + 960 + 18, tmp); _mm_storeu_si128((__m128i *)to + 960 + 19, tmp); _mm_storeu_si128((__m128i *)to + 960 + 20, tmp); _mm_storeu_si128((__m128i *)to + 960 + 21, tmp); _mm_storeu_si128((__m128i *)to + 960 + 22, tmp); _mm_storeu_si128((__m128i *)to + 960 + 23, tmp); _mm_storeu_si128((__m128i *)to + 960 + 24, tmp); _mm_storeu_si128((__m128i *)to + 960 + 25, 
tmp); _mm_storeu_si128((__m128i *)to + 960 + 26, tmp); _mm_storeu_si128((__m128i *)to + 960 + 27, tmp); _mm_storeu_si128((__m128i *)to + 960 + 28, tmp); _mm_storeu_si128((__m128i *)to + 960 + 29, tmp); _mm_storeu_si128((__m128i *)to + 960 + 30, tmp); _mm_storeu_si128((__m128i *)to + 960 + 31, tmp); _mm_storeu_si128((__m128i *)to + 960 + 32, tmp); _mm_storeu_si128((__m128i *)to + 960 + 33, tmp); _mm_storeu_si128((__m128i *)to + 960 + 34, tmp); _mm_storeu_si128((__m128i *)to + 960 + 35, tmp); _mm_storeu_si128((__m128i *)to + 960 + 36, tmp); _mm_storeu_si128((__m128i *)to + 960 + 37, tmp); _mm_storeu_si128((__m128i *)to + 960 + 38, tmp); _mm_storeu_si128((__m128i *)to + 960 + 39, tmp); _mm_storeu_si128((__m128i *)to + 960 + 40, tmp); _mm_storeu_si128((__m128i *)to + 960 + 41, tmp); _mm_storeu_si128((__m128i *)to + 960 + 42, tmp); _mm_storeu_si128((__m128i *)to + 960 + 43, tmp); _mm_storeu_si128((__m128i *)to + 960 + 44, tmp); _mm_storeu_si128((__m128i *)to + 960 + 45, tmp); _mm_storeu_si128((__m128i *)to + 960 + 46, tmp); _mm_storeu_si128((__m128i *)to + 960 + 47, tmp); _mm_storeu_si128((__m128i *)to + 960 + 48, tmp); _mm_storeu_si128((__m128i *)to + 960 + 49, tmp); _mm_storeu_si128((__m128i *)to + 960 + 50, tmp); _mm_storeu_si128((__m128i *)to + 960 + 51, tmp); _mm_storeu_si128((__m128i *)to + 960 + 52, tmp); _mm_storeu_si128((__m128i *)to + 960 + 53, tmp); _mm_storeu_si128((__m128i *)to + 960 + 54, tmp); _mm_storeu_si128((__m128i *)to + 960 + 55, tmp); _mm_storeu_si128((__m128i *)to + 960 + 56, tmp); _mm_storeu_si128((__m128i *)to + 960 + 57, tmp); _mm_storeu_si128((__m128i *)to + 960 + 58, tmp); _mm_storeu_si128((__m128i *)to + 960 + 59, tmp); _mm_storeu_si128((__m128i *)to + 960 + 60, tmp); _mm_storeu_si128((__m128i *)to + 960 + 61, tmp); _mm_storeu_si128((__m128i *)to + 960 + 62, tmp); _mm_storeu_si128((__m128i *)to + 960 + 63, tmp); to += 4096; break; } case 0x01: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const 
__m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 
40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to 
+ 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 
37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, 
tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 
192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 
+ 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 
9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to 
+ 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 
384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to 
+ 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i 
*)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to 
+ 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); 
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); 
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); _mm_storeu_si128((__m128i *)to + 640, tmp); _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); 
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp); _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); _mm_storeu_si128((__m128i *)to + 640 + 
57, tmp); _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); _mm_storeu_si128((__m128i *)to + 704, tmp); _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); _mm_storeu_si128((__m128i *)to + 704 + 34, 
tmp); _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); _mm_storeu_si128((__m128i *)to + 768, tmp); _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); _mm_storeu_si128((__m128i *)to + 768 + 11, 
tmp); _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); _mm_storeu_si128((__m128i *)to + 
768 + 52, tmp); _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); _mm_storeu_si128((__m128i *)to + 832, tmp); _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); _mm_storeu_si128((__m128i *)to + 832 
+ 29, tmp); _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); _mm_storeu_si128((__m128i *)to + 896, tmp); _mm_storeu_si128((__m128i *)to + 896 + 1, tmp); _mm_storeu_si128((__m128i *)to + 896 + 2, tmp); _mm_storeu_si128((__m128i *)to + 896 + 3, tmp); _mm_storeu_si128((__m128i *)to + 896 + 4, tmp); _mm_storeu_si128((__m128i *)to + 896 + 5, tmp); _mm_storeu_si128((__m128i *)to + 896 
+ 6, tmp); _mm_storeu_si128((__m128i *)to + 896 + 7, tmp); _mm_storeu_si128((__m128i *)to + 896 + 8, tmp); _mm_storeu_si128((__m128i *)to + 896 + 9, tmp); _mm_storeu_si128((__m128i *)to + 896 + 10, tmp); _mm_storeu_si128((__m128i *)to + 896 + 11, tmp); _mm_storeu_si128((__m128i *)to + 896 + 12, tmp); _mm_storeu_si128((__m128i *)to + 896 + 13, tmp); _mm_storeu_si128((__m128i *)to + 896 + 14, tmp); _mm_storeu_si128((__m128i *)to + 896 + 15, tmp); _mm_storeu_si128((__m128i *)to + 896 + 16, tmp); _mm_storeu_si128((__m128i *)to + 896 + 17, tmp); _mm_storeu_si128((__m128i *)to + 896 + 18, tmp); _mm_storeu_si128((__m128i *)to + 896 + 19, tmp); _mm_storeu_si128((__m128i *)to + 896 + 20, tmp); _mm_storeu_si128((__m128i *)to + 896 + 21, tmp); _mm_storeu_si128((__m128i *)to + 896 + 22, tmp); _mm_storeu_si128((__m128i *)to + 896 + 23, tmp); _mm_storeu_si128((__m128i *)to + 896 + 24, tmp); _mm_storeu_si128((__m128i *)to + 896 + 25, tmp); _mm_storeu_si128((__m128i *)to + 896 + 26, tmp); _mm_storeu_si128((__m128i *)to + 896 + 27, tmp); _mm_storeu_si128((__m128i *)to + 896 + 28, tmp); _mm_storeu_si128((__m128i *)to + 896 + 29, tmp); _mm_storeu_si128((__m128i *)to + 896 + 30, tmp); _mm_storeu_si128((__m128i *)to + 896 + 31, tmp); _mm_storeu_si128((__m128i *)to + 896 + 32, tmp); _mm_storeu_si128((__m128i *)to + 896 + 33, tmp); _mm_storeu_si128((__m128i *)to + 896 + 34, tmp); _mm_storeu_si128((__m128i *)to + 896 + 35, tmp); _mm_storeu_si128((__m128i *)to + 896 + 36, tmp); _mm_storeu_si128((__m128i *)to + 896 + 37, tmp); _mm_storeu_si128((__m128i *)to + 896 + 38, tmp); _mm_storeu_si128((__m128i *)to + 896 + 39, tmp); _mm_storeu_si128((__m128i *)to + 896 + 40, tmp); _mm_storeu_si128((__m128i *)to + 896 + 41, tmp); _mm_storeu_si128((__m128i *)to + 896 + 42, tmp); _mm_storeu_si128((__m128i *)to + 896 + 43, tmp); _mm_storeu_si128((__m128i *)to + 896 + 44, tmp); _mm_storeu_si128((__m128i *)to + 896 + 45, tmp); _mm_storeu_si128((__m128i *)to + 896 + 46, tmp); _mm_storeu_si128((__m128i *)to 
+ 896 + 47, tmp); _mm_storeu_si128((__m128i *)to + 896 + 48, tmp); _mm_storeu_si128((__m128i *)to + 896 + 49, tmp); _mm_storeu_si128((__m128i *)to + 896 + 50, tmp); _mm_storeu_si128((__m128i *)to + 896 + 51, tmp); _mm_storeu_si128((__m128i *)to + 896 + 52, tmp); _mm_storeu_si128((__m128i *)to + 896 + 53, tmp); _mm_storeu_si128((__m128i *)to + 896 + 54, tmp); _mm_storeu_si128((__m128i *)to + 896 + 55, tmp); _mm_storeu_si128((__m128i *)to + 896 + 56, tmp); _mm_storeu_si128((__m128i *)to + 896 + 57, tmp); _mm_storeu_si128((__m128i *)to + 896 + 58, tmp); _mm_storeu_si128((__m128i *)to + 896 + 59, tmp); _mm_storeu_si128((__m128i *)to + 896 + 60, tmp); _mm_storeu_si128((__m128i *)to + 896 + 61, tmp); _mm_storeu_si128((__m128i *)to + 896 + 62, tmp); _mm_storeu_si128((__m128i *)to + 896 + 63, tmp); to += 3840; break; } case 0x02: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i 
*)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i 
*)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); 
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); 
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 
53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, 
tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, 
tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 
384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 
+ 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 
512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to 
+ 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 
576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); 
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); _mm_storeu_si128((__m128i *)to + 640, tmp); _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); 
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp); _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); _mm_storeu_si128((__m128i *)to + 704, tmp); _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); 
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp); _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); _mm_storeu_si128((__m128i *)to + 704 + 
55, tmp); _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); _mm_storeu_si128((__m128i *)to + 768, tmp); _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); _mm_storeu_si128((__m128i *)to + 768 + 32, 
tmp); _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); _mm_storeu_si128((__m128i *)to + 832, tmp); _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); _mm_storeu_si128((__m128i *)to + 832 + 9, 
tmp); _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); _mm_storeu_si128((__m128i *)to + 
832 + 50, tmp); _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); to += 3584; break; } case 0x03: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i 
*)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 
61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, 
tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, 
tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 
256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 
+ 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 
10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i 
*)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to 
+ 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to 
+ 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i 
*)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to 
+ 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); 
_mm_storeu_si128((__m128i *)to + 640, tmp); _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); 
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp); _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); _mm_storeu_si128((__m128i *)to + 704, tmp); _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); 
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp); _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); _mm_storeu_si128((__m128i *)to + 704 + 
58, tmp); _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); _mm_storeu_si128((__m128i *)to + 768, tmp); _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); _mm_storeu_si128((__m128i *)to + 768 + 35, 
tmp); _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); to += 3328; break; } case 0x04: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 
+ 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 
128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i 
*)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to 
+ 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i 
*)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i 
*)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to 
+ 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); 
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); 
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); 
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 
54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, 
tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 576 + 8, 
tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i *)to + 
576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); _mm_storeu_si128((__m128i *)to + 640, tmp); _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); _mm_storeu_si128((__m128i *)to + 640 
+ 26, tmp); _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); _mm_storeu_si128((__m128i *)to + 704, tmp); _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); _mm_storeu_si128((__m128i *)to + 
704 + 3, tmp); _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); _mm_storeu_si128((__m128i *)to 
+ 704 + 44, tmp); _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); to += 3072; break; } case 0x05: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, 
tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to 
+ 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); 
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); 
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 
50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, 
tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, 
tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 
45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, 
tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 
448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 
+ 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 
17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i 
*)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); _mm_storeu_si128((__m128i *)to + 640, tmp); _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); _mm_storeu_si128((__m128i *)to 
+ 640 + 35, tmp); _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); to += 2816; break; } case 0x06: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i 
*)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 
28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i 
*)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); 
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, 
tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, 
tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, 
tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 
320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 
+ 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 
13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i 
*)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to 
+ 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); _mm_storeu_si128((__m128i *)to + 576, tmp); _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); _mm_storeu_si128((__m128i *)to + 
576 + 8, tmp); _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); _mm_storeu_si128((__m128i 
*)to + 576 + 49, tmp); _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); to += 2560; break; } case 0x07: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i 
*)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i 
*)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); 
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); 
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 
55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, 
tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, 
tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 
384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 
+ 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); _mm_storeu_si128((__m128i *)to + 512, tmp); _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); _mm_storeu_si128((__m128i *)to + 
512 + 4, tmp); _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); _mm_storeu_si128((__m128i 
*)to + 512 + 45, tmp); _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); to += 2304; break; } case 0x08: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, 
tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i 
*)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); 
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); 
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 
51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, 
tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, 
tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 
+ 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); _mm_storeu_si128((__m128i *)to + 448, tmp); _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); _mm_storeu_si128((__m128i *)to + 448 + 
23, tmp); _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); to += 2048; break; } case 
0x09: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to 
+ 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to 
+ 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 
35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, 
tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 
192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 
+ 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 
7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 
320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); _mm_storeu_si128((__m128i *)to + 384, tmp); _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); _mm_storeu_si128((__m128i *)to + 384 
+ 25, tmp); _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); to += 1792; break; } case 0x0a: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const 
__m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 
40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to 
+ 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 
37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, 
tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 
192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 
+ 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); _mm_storeu_si128((__m128i *)to + 320, tmp); _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); _mm_storeu_si128((__m128i *)to + 320 + 
9, tmp); _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); _mm_storeu_si128((__m128i *)to 
+ 320 + 50, tmp); _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); to += 1536; break; } case 0x0b: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i 
*)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 
61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, 
tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); _mm_storeu_si128((__m128i *)to + 256, tmp); _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); _mm_storeu_si128((__m128i *)to + 256 + 15, 
tmp); _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); _mm_storeu_si128((__m128i *)to + 
256 + 56, tmp); _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); to += 1280; break; } case 0x0c: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i 
*)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); _mm_storeu_si128((__m128i *)to + 192, tmp); _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); _mm_storeu_si128((__m128i *)to + 192 + 3, 
tmp); _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); _mm_storeu_si128((__m128i *)to + 192 + 
44, tmp); _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); to += 1024; break; } case 0x0d: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); 
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i 
*)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i 
*)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); _mm_storeu_si128((__m128i *)to + 128, tmp); _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); 
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp); _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); _mm_storeu_si128((__m128i *)to + 128 + 
56, tmp); _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); to += 768; break; } case 0x0e: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i 
*)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); _mm_storeu_si128((__m128i *)to + 64, tmp); _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); 
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp); _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); _mm_storeu_si128((__m128i *)to + 
64 + 50, tmp); _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); to += 512; break; } case 0x0f: { #ifdef NO_ZEROS const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); #endif _mm_storeu_si128((__m128i *)to + 0, tmp); _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); _mm_storeu_si128((__m128i 
*)to + 0 + 24, tmp); _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); to += 256; break; } case 0x10: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, 
_mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); 
} { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); 
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); 
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); 
} { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); 
_mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); 
_mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 480, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 480 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 256; to += 2048; break; } case 0x11: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to 
+ 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); 
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); 
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to 
+ 416, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); 
_mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); 
_mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 240; to += 1920; break; } case 0x12: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 
+ 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 
+ 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); 
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); 
} { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); 
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); 
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 224; to += 1792; break; } case 0x13: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to 
+ 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); 
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); 
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 208; to += 1664; break; } case 0x14: { { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 
+ 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); 
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to 
+ 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); 
} { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); 
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 192; to += 1536; break; } case 0x15: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), 
mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream 
= _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); 
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); 
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); 
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); 
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); 
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); 
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); 
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 28, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 176; to += 1408; break; } case 0x16: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), 
mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); 
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); 
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); 
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); 
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); 
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); 
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 160; to += 1280; break; } case 0x17: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 
64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 
+ 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); 
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 144; to += 1152; break; } case 0x18: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 
+ 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); 
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i 
*)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to 
+ 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 128; to += 1024; break; } case 0x19: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), 
mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); 
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 112; to += 896; break; } case 0x1a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 
4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 
+ 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i 
*)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 96; to += 768; break; } case 0x1b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), 
mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); 
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 80; to += 640; break; } case 0x1c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 
+ 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); 
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i 
*)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 64; to += 512; break; } case 0x1d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, 
_mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); 
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); 
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 48; to += 384; break; } case 0x1e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 
4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 32; to += 256; break; } case 0x1f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); 
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); } in += 16; to += 128; break; } case 
0x20: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 
+ 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i 
*)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); 
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); 
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 176, 
_mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 192 
+ 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); 
_mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 240, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 240 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); 
_mm_storeu_si128((__m128i *)to + 240 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 256; to += 1024; break; } case 0x21: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); 
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, 
_mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); 
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); 
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i 
*)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); 
_mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 240; to += 960; break; } case 0x22: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); 
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), 
mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); 
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); 
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); 
_mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 224; to += 896; break; } case 0x23: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 
32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); 
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 
6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); 
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 160, 
_mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 176 
+ 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); 
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 208; to += 832; break; } /* Key 0x24: unpack 2-bit integers. Twelve 16-byte words are loaded from in (in + 0 .. in + 11). For each word, the shifts 0, 2, ..., 30 masked with mask_2 (0x03 in every 32-bit lane) yield sixteen output vectors of four integers, stored at to + 16 * word + shift_index. Total written: 12 * 16 * 4 = 768 integers. */ case 0x24: { /* _mm_srli_epi64 shifts 64-bit halves, yet it is safe for the 32-bit lanes used here: with shift counts of at most 30, bits crossing in from the neighbouring lane land at bit 2 or higher and are cleared by mask_2. */ { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 
+ 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 
4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 
80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); 
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i 
*)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); 
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); 
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } /* End of key 0x24: advance past the 12 * 16 = 192 input bytes consumed (in is a byte pointer) and the 768 integers written (to is a uint32_t pointer). */ in += 192; to += 768; break; } case 0x25: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), 
mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 
12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); 
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); 
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 176; to += 704; break; } case 0x26: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), 
mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 
12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); 
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 160; to += 640; break; } case 0x27: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), 
mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), 
mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 
12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 144; to += 576; break; } case 0x28: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 
+ 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); 
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 128; to += 512; break; } case 0x29: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), 
mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 
10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 112; to += 448; break; } case 0x2a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); 
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 
+ 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to 
+ 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); 
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), 
mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 96; to += 384; break; } case 0x2b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); 
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 80; to += 320; break; } case 0x2c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 
32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); 
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 64; to += 256; break; } case 0x2d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in 
+ 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); 
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 48; to += 192; break; } case 0x2e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 32; to += 128; break; } case 0x2f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); 
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); } in += 16; to += 64; break; } case 0x30: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); 
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i 
*)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 150, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 150 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 256; to += 640; break; } case 0x31: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, 
mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); 
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 240; to += 600; break; } case 0x32: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 224; to += 560; break; } case 0x33: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, 
mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); 
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 120 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 208; to += 520; break; } case 0x34: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 192; to += 480; break; } case 0x35: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 176; to += 440; break; } case 0x36: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); 
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to 
+ 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 90 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 160; to += 400; break; } case 0x37: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 144; to += 360; break; } case 0x38: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); 
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 128; to += 320; break; } case 0x39: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); 
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), 
mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 112; to += 280; break; } case 0x3a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); 
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to 
+ 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in + 5); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 96; to += 240; break; } case 0x3b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 80; to += 200; break; } case 0x3c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 64; to += 160; break; } case 0x3d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); 
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 48; to += 120; break; } case 0x3e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); 
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 32; to += 80; break; } case 0x3f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); 
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); } in += 16; to += 40; break; } case 0x40: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to 
+ 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), 
mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 104 
+ 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 256; to += 512; break; } case 0x41: { { const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); 
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); 
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 240; to += 480; break; } case 0x42: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 
24), mask_4)); _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 224; to += 448; break; } case 0x43: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 208; to += 416; break; } case 0x44: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); 
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 192; to += 384; break; } case 0x45: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); 
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 176; to += 352; break; } case 0x46: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 160; to += 320; break; } case 0x47: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 144; to += 288; break; } case 0x48: 
{ { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i 
*)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 128; to += 256; break; } case 0x49: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); 
} { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); 
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 112; to += 224; break; } case 0x4a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); 
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); 
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 96; to += 192; break; } case 0x4b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); 
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 80; to += 160; break; } case 0x4c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 64; to += 128; break; } case 0x4d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 48; to += 96; break; } case 0x4e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 32; to += 64; break; } case 0x4f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); } in += 16; to += 32; break; } case 0x50: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); 
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, 
_mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 90 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 256; to += 384; break; } case 0x51: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); 
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); 
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); 
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 240; to += 360; break; } case 0x52: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 224; to += 336; break; } case 0x53: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); 
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, 
_mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 208; to += 312; break; } case 0x54: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); 
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); 
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); 
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); 
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 192; to += 288; break; } case 0x55: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, 
mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 176; to += 264; break; } case 0x56: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 160; to += 240; break; } case 0x57: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); 
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 144; to += 216; break; } case 0x58: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 128; to += 192; break; } case 0x59: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); 
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); 
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 112; to += 168; break; } case 0x5a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 96; to += 144; break; } case 0x5b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 24, 
_mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 80; to += 120; break; } case 0x5c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 
1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 64; to += 96; break; } case 0x5d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); 
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 48; to += 72; break; } case 0x5e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 32; to += 48; break; } case 0x5f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); } in += 16; to += 24; break; } case 0x60: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); 
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, 
_mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 256; to += 320; break; } case 0x61: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); 
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 240; to += 300; break; } case 0x62: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); 
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 224; to += 280; break; } case 0x63: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); 
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 208; to += 260; break; } case 0x64: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); 
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 192; to += 240; break; } case 0x65: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); 
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 
25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 176; to += 220; break; } case 0x66: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); 
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 160; to += 200; break; } case 0x67: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 144; to += 180; break; } case 0x68: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); 
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 128; to += 160; break; } case 0x69: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); 
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 
4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 112; to += 140; break; } case 0x6a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); 
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 96; to += 120; break; } case 0x6b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 80; to += 100; break; } case 0x6c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 64; to += 80; break; } case 0x6d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 48; to += 60; break; } case 0x6e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 5 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 32; to += 40; break; } case 0x6f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); } in += 16; to += 20; break; } case 0x70: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); 
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 
5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 126 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); _mm_storeu_si128((__m128i *)to + 135, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); _mm_storeu_si128((__m128i *)to + 135 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 135 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 512; to += 576; break; } case 0x71: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); 
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i 
*)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 480; to += 540; break; } case 0x72: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); 
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 
21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), 
mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), 
mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, 
mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 99 
+ 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); 
_mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 448; to += 504; break; } case 0x73: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); 
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 416; to += 468; break; } case 0x74: { { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const 
__m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 384; to += 432; break; } case 0x75: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); 
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 352; to += 396; break; } case 0x76: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); 
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); 
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); 
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 320; to += 360; break; } case 0x77: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, 
_mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i 
*)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i 
*)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i 
*)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 288; to += 324; break; } case 0x78: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); 
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); 
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 256; to += 288; break; } case 0x79: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i 
*)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } 
{ const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); 
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 224; to += 252; break; } case 0x7a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 45 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 192; to += 216; break; } case 0x7b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 160; to += 180; break; } case 0x7c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 27 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 128; to += 144; break; } case 0x7d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); 
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 96; to += 108; break; } case 0x7e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 64; to += 72; break; } case 0x7f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); } in += 32; to += 36; break; } case 0x80: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); 
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 60, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); 
} in += 256; to += 256; break; } case 0x81: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 
2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); 
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 240; to += 240; break; } case 0x82: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); 
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } 
{ const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 224; to += 224; break; } case 0x83: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, 
_mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); 
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 208; to += 208; break; } case 0x84: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); 
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } 
in += 192; to += 192; break; } case 0x85: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 
2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); 
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 176; to += 176; break; } case 0x86: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); 
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); 
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 160; to += 160; break; } case 0x87: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 144; to += 144; break; } case 0x88: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 
0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { 
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 128; to += 128; break; } case 0x89: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, 
_mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 112; to += 112; break; } case 0x8a: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); 
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 96; to += 96; break; } case 0x8b: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 80; to += 80; break; } case 0x8c: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, 
_mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 64; to += 64; break; } case 0x8d: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 48; to += 48; break; } case 0x8e: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 32; to += 32; break; } case 0x8f: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); } in += 16; to += 16; break; } case 0x90: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); 
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, 
_mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); 
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, 
mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i 
*)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); _mm_storeu_si128((__m128i *)to + 105, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 105 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 105 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); _mm_storeu_si128((__m128i *)to + 105 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 105 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 105 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 105 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 512; to += 448; break; } case 0x91: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to 
+ 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); 
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, 
mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i 
*)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 480; to += 420; break; } case 0x92: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i 
*)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); 
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, 
mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i 
*)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 448; to += 392; break; } case 0x93: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i 
*)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); 
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, 
mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i 
*)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 416; to += 364; break; } case 0x94: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); 
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, 
mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i 
*)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 384; to += 336; break; } case 0x95: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), 
mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 352; to += 308; break; } case 0x96: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); 
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 320; to += 280; break; } case 0x97: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), 
mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 288; to += 252; break; } case 0x98: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), 
mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 256; to += 224; break; } case 0x99: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); 
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 224; to += 196; break; } case 0x9a: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); 
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 192; to += 168; break; } case 0x9b: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 160; to += 140; break; } case 0x9c: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); 
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 128; to += 112; break; } case 0x9d: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); 
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 96; to += 84; break; } case 0x9e: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i 
*)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 64; to += 56; break; } case 0x9f: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); } in += 32; to += 28; break; } case 0xa0: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in 
+ 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 256; to += 192; break; } case 0xa1: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); 
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); 
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 240; to += 180; break; } case 0xa2: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 224; to += 168; break; } case 0xa3: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); 
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 208; to += 156; break; } case 0xa4: { { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 33 + 
2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 192; to += 144; break; } case 0xa5: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 176; to += 132; break; } case 0xa6: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); 
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 160; to += 120; break; } case 0xa7: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); 
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 144; to += 108; break; } case 0xa8: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); 
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 128; to += 96; break; } case 0xa9: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 112; to += 84; break; } case 0xaa: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 96; to += 72; break; } case 0xab: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 80; to += 60; break; } case 0xac: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 64; to += 48; break; } case 0xad: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 48; to += 36; break; } 
case 0xae: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 32; to += 24; break; } case 0xaf: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); } in += 16; to += 12; break; } case 0xb0: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); 
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); 
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i 
*)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); _mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); _mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 512; to += 320; break; } case 0xb1: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 
2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 60 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 480; to += 300; break; } case 0xb2: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 60 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 448; to += 280; break; } case 0xb3: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 
5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 50 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 416; to += 260; break; } case 0xb4: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 384; to += 240; break; } case 0xb5: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 352; to += 220; break; } case 0xb6: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 
5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 320; to += 200; break; } case 0xb7: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 288; to += 180; break; } case 0xb8: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 256; to += 160; break; } case 0xb9: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 
5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 224; to += 140; break; } case 0xba: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 192; to += 120; break; } case 0xbb: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 160; to += 100; break; } case 0xbc: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 
+ 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 128; to += 80; break; } case 0xbd: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 96; to += 60; break; } case 0xbe: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 64; to += 40; break; } case 0xbf: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); } in += 32; to += 20; break; } case 0xc0: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 30, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 256; to += 128; break; } case 0xc1: { 
{ const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 240; to += 120; break; } case 0xc2: { 
{ const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 224; to += 112; break; } case 0xc3: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { 
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 208; to += 104; break; } case 0xc4: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const 
__m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 192; to += 96; break; } case 0xc5: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i 
tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 176; to += 88; break; } case 0xc6: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); 
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 160; to 
+= 80; break; } case 0xc7: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); 
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 144; to += 72; break; } case 0xc8: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 128; to += 64; break; } case 0xc9: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to 
+ 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 112; to += 56; break; } case 0xca: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 10 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 96; to += 48; break; } case 0xcb: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 80; to += 40; break; } case 0xcc: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 64; to += 32; break; } case 0xcd: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 48; to += 24; break; } case 0xce: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 32; to += 16; break; } case 0xcf: { { const 
__m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); } in += 16; to += 8; break; } case 0xd0: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 
21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 512; to += 192; break; } case 0xd1: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to 
+ 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 39 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 480; to += 180; break; } case 0xd2: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); 
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); 
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 448; to += 168; break; } case 0xd3: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 416; to += 156; break; } case 0xd4: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to 
+ 30, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 384; to += 144; break; } case 0xd5: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); 
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); 
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 352; to += 132; break; } case 0xd6: { { const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 320; to += 120; break; } case 0xd7: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 
11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const 
__m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 288; to += 108; break; } case 0xd8: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 256; to += 96; break; } case 0xd9: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); 
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 224; to += 84; break; } case 0xda: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), 
mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 192; to += 72; break; } case 0xdb: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 160; to += 60; break; } case 0xdc: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const 
__m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 128; to += 48; break; } case 0xdd: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 6 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 96; to += 36; break; } case 0xde: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 64; to += 24; break; } case 0xdf: { { const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); } in += 32; to += 12; break; } case 0xe0: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, 
tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 10, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 11, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 12, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 13, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 14, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15); _mm_storeu_si128((__m128i *)to + 15, tmp); } in += 256; to += 64; break; } case 0xe1: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); 
} { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 10, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 11, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 12, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 13, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); _mm_storeu_si128((__m128i *)to + 14, tmp); } in += 240; to += 60; break; } case 0xe2: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } { 
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 10, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 11, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 12, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); _mm_storeu_si128((__m128i *)to + 13, tmp); } in += 224; to += 56; break; } case 0xe3: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 10, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 11, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); _mm_storeu_si128((__m128i *)to + 12, tmp); } in += 208; to += 52; break; } case 0xe4: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); 
_mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 10, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); _mm_storeu_si128((__m128i *)to + 11, tmp); } in += 192; to += 48; break; } case 0xe5: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); 
_mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); _mm_storeu_si128((__m128i *)to + 10, tmp); } in += 176; to += 44; break; } case 0xe6: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); _mm_storeu_si128((__m128i *)to + 9, tmp); } in += 160; to += 40; break; } case 0xe7: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); _mm_storeu_si128((__m128i *)to + 8, tmp); } in += 144; to += 36; break; } case 0xe8: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); _mm_storeu_si128((__m128i *)to + 7, tmp); } in += 128; to += 32; break; } case 0xe9: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); _mm_storeu_si128((__m128i *)to + 6, tmp); } in += 112; to += 28; break; } case 0xea: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 
0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); _mm_storeu_si128((__m128i *)to + 5, tmp); } in += 96; to += 24; break; } case 0xeb: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); _mm_storeu_si128((__m128i *)to + 4, tmp); } in += 80; to += 20; break; } case 0xec: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); _mm_storeu_si128((__m128i *)to + 3, tmp); } in += 64; to += 16; break; } case 0xed: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); _mm_storeu_si128((__m128i *)to + 2, tmp); } in += 48; to += 12; break; } case 0xee: { { const __m128i tmp = _mm_loadu_si128((__m128i 
*)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, tmp); } in += 32; to += 8; break; } case 0xef: { { const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); _mm_storeu_si128((__m128i *)to + 0, tmp); } in += 16; to += 4; break; } case 0xf0: { *(to + 0) = *(uint8_t *)(in + 0); *(to + 1) = *(uint8_t *)(in + 1); *(to + 2) = *(uint8_t *)(in + 2); *(to + 3) = *(uint8_t *)(in + 3); in += 4; to += 4; break; } case 0xf1: { *(to + 0) = *(uint8_t *)(in + 0); *(to + 1) = *(uint8_t *)(in + 1); *(to + 2) = *(uint8_t *)(in + 2); in += 3; to += 3; break; } case 0xf2: { *(to + 0) = *(uint8_t *)(in + 0); *(to + 1) = *(uint8_t *)(in + 1); in += 2; to += 2; break; } case 0xf3: { *(to + 0) = *(uint8_t *)(in + 0); in += 1; to += 1; break; } case 0xf4: { *(to + 0) = *(uint16_t *)(in + 2 * 0); *(to + 1) = *(uint16_t *)(in + 2 * 1); *(to + 2) = *(uint16_t *)(in + 2 * 2); *(to + 3) = *(uint16_t *)(in + 2 * 3); in += 2 * 4; to += 4; break; } case 0xf5: { *(to + 0) = *(uint16_t *)(in + 2 * 0); *(to + 1) = *(uint16_t *)(in + 2 * 1); *(to + 2) = *(uint16_t *)(in + 2 * 2); in += 2 * 3; to += 3; break; } case 0xf6: { *(to + 0) = *(uint16_t *)(in + 2 * 0); *(to + 1) = *(uint16_t *)(in + 2 * 1); in += 2 * 2; to += 2; break; } case 0xf7: { *(to + 0) = *(uint16_t *)(in + 2 * 0); in += 2 * 1; to += 1; break; } case 0xf8: { *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); *(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2)); *(to + 3) = (*(uint8_t *)(in + 3 * 3) << 16) | (*(uint8_t *)(in + 3 * 3 + 1) << 8) | (*(uint8_t *)(in + 3 * 3 + 2)); in += 3 * 4; to += 4; break; } case 0xf9: { *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 
8) | (*(uint8_t *)(in + 3 * 0 + 2)); *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); *(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2)); in += 3 * 3; to += 3; break; } case 0xfa: { *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); in += 3 * 2; to += 2; break; } case 0xfb: { *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); in += 3 * 1; to += 1; break; } case 0xfc: { *(to + 0) = *(uint32_t *)(in + 4 * 0); *(to + 1) = *(uint32_t *)(in + 4 * 1); *(to + 2) = *(uint32_t *)(in + 4 * 2); *(to + 3) = *(uint32_t *)(in + 4 * 3); in += 4 * 4; to += 4; break; } case 0xfd: { *(to + 0) = *(uint32_t *)(in + 4 * 0); *(to + 1) = *(uint32_t *)(in + 4 * 1); *(to + 2) = *(uint32_t *)(in + 4 * 2); in += 4 * 3; to += 3; break; } case 0xfe: { *(to + 0) = *(uint32_t *)(in + 4 * 0); *(to + 1) = *(uint32_t *)(in + 4 * 1); in += 4 * 2; to += 2; break; } case 0xff: { *(to + 0) = *(uint32_t *)(in + 4 * 0); in += 4 * 1; to += 1; break; } } } }