/*
	Bit-mask tables used by the SIMD decoder that follows.
	Each table holds four identical 32-bit values (16 bytes, i.e. one __m128i's
	worth) in which only the low N bits are set, where N is the numeric suffix
	of the table's name: static_mask_21 is 0x1fffff, static_mask_12 is 0xfff,
	static_mask_10 is 0x3ff, and so on down to static_mask_1, which is 0x01.
	decodeArray() copies every table into a same-numbered __m128i local
	(mask_21 ... mask_1) with _mm_loadu_si128() before it enters its key loop.
	In the key cases visible here it also reloads static_mask_1 as the value
	written to the output when NO_ZEROS is defined.
	NOTE(review): ALIGN_16 is defined elsewhere; presumably it requests 16-byte
	alignment (the loads here are the unaligned form regardless) - confirm.
*/
static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; void ANT_compress_qmx_v2::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) { __m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; uint8_t *in = (uint8_t *)source; uint8_t *keys = ((uint8_t *)source) + len - 1; mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); mask_5 = _mm_loadu_si128((__m128i *)static_mask_5); mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers { switch (*keys--) { case 0x00: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); 
#endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i 
*)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x01: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); 
_mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x02: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); 
_mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i 
*)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x03: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); 
_mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x04: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); 
_mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i 
*)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x05: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); 
_mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x06: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); 
_mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); 
_mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x07: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); 
_mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x08: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); 
_mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); 
_mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x09: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); 
_mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x0a: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); 
_mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); 
_mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x0b: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); 
_mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x0c: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); 
_mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); 
_mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x0d: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); 
_mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x0e: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); 
_mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); _mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); 
_mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; case 0x0f: #ifdef NO_ZEROS tmp = _mm_loadu_si128((__m128i *)static_mask_1); #else tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); #endif _mm_storeu_si128((__m128i *)to, tmp); _mm_storeu_si128((__m128i *)to + 1, tmp); _mm_storeu_si128((__m128i *)to + 2, tmp); _mm_storeu_si128((__m128i *)to + 3, tmp); _mm_storeu_si128((__m128i *)to + 4, tmp); _mm_storeu_si128((__m128i *)to + 5, tmp); _mm_storeu_si128((__m128i *)to + 6, tmp); _mm_storeu_si128((__m128i *)to + 7, tmp); _mm_storeu_si128((__m128i *)to + 8, tmp); _mm_storeu_si128((__m128i *)to + 9, tmp); _mm_storeu_si128((__m128i *)to + 10, tmp); _mm_storeu_si128((__m128i *)to + 11, tmp); _mm_storeu_si128((__m128i *)to + 12, tmp); _mm_storeu_si128((__m128i *)to + 13, tmp); _mm_storeu_si128((__m128i *)to + 14, tmp); _mm_storeu_si128((__m128i *)to + 15, tmp); _mm_storeu_si128((__m128i *)to + 16, tmp); _mm_storeu_si128((__m128i *)to + 17, tmp); _mm_storeu_si128((__m128i *)to + 18, tmp); _mm_storeu_si128((__m128i *)to + 19, tmp); _mm_storeu_si128((__m128i *)to + 20, tmp); _mm_storeu_si128((__m128i *)to + 21, tmp); _mm_storeu_si128((__m128i *)to + 22, tmp); _mm_storeu_si128((__m128i *)to + 23, tmp); _mm_storeu_si128((__m128i *)to + 24, tmp); _mm_storeu_si128((__m128i *)to + 25, tmp); _mm_storeu_si128((__m128i *)to + 26, tmp); _mm_storeu_si128((__m128i *)to + 27, tmp); _mm_storeu_si128((__m128i *)to + 28, tmp); _mm_storeu_si128((__m128i *)to + 29, tmp); _mm_storeu_si128((__m128i *)to + 30, tmp); _mm_storeu_si128((__m128i *)to + 31, tmp); _mm_storeu_si128((__m128i *)to + 32, tmp); _mm_storeu_si128((__m128i *)to + 33, tmp); _mm_storeu_si128((__m128i *)to + 34, tmp); _mm_storeu_si128((__m128i *)to + 35, tmp); _mm_storeu_si128((__m128i *)to + 36, tmp); _mm_storeu_si128((__m128i *)to + 37, tmp); _mm_storeu_si128((__m128i *)to + 38, tmp); _mm_storeu_si128((__m128i *)to + 39, tmp); _mm_storeu_si128((__m128i *)to + 40, tmp); 
_mm_storeu_si128((__m128i *)to + 41, tmp); _mm_storeu_si128((__m128i *)to + 42, tmp); _mm_storeu_si128((__m128i *)to + 43, tmp); _mm_storeu_si128((__m128i *)to + 44, tmp); _mm_storeu_si128((__m128i *)to + 45, tmp); _mm_storeu_si128((__m128i *)to + 46, tmp); _mm_storeu_si128((__m128i *)to + 47, tmp); _mm_storeu_si128((__m128i *)to + 48, tmp); _mm_storeu_si128((__m128i *)to + 49, tmp); _mm_storeu_si128((__m128i *)to + 50, tmp); _mm_storeu_si128((__m128i *)to + 51, tmp); _mm_storeu_si128((__m128i *)to + 52, tmp); _mm_storeu_si128((__m128i *)to + 53, tmp); _mm_storeu_si128((__m128i *)to + 54, tmp); _mm_storeu_si128((__m128i *)to + 55, tmp); _mm_storeu_si128((__m128i *)to + 56, tmp); _mm_storeu_si128((__m128i *)to + 57, tmp); _mm_storeu_si128((__m128i *)to + 58, tmp); _mm_storeu_si128((__m128i *)to + 59, tmp); _mm_storeu_si128((__m128i *)to + 60, tmp); _mm_storeu_si128((__m128i *)to + 61, tmp); _mm_storeu_si128((__m128i *)to + 62, tmp); _mm_storeu_si128((__m128i *)to + 63, tmp); to += 256; break; case 0x10: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x11: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x12: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x13: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x14: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x15: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x16: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, 
_mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x17: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, 
_mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x18: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x19: byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x1a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x1b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = 
_mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x1c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream 
= _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x1d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream 
= _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x1e: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); 
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); 
byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; case 0x1f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); 
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); 
byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); byte_stream = _mm_srli_epi64(byte_stream, 1); _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); in += 16; to += 128; break; case 0x20: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); 
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x21: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, 
mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x22: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); 
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x23: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, 
mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x24: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); 
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x25: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, 
mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x26: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); 
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x27: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, 
_mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x28: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = 
_mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x29: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to 
+ 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x2a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = 
_mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x2b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to 
+ 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x2c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = 
_mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x2d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to 
+ 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x2e: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = 
_mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; case 0x2f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to 
+ 14, _mm_and_si128(byte_stream, mask_2)); byte_stream = _mm_srli_epi64(byte_stream, 2); _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); in += 16; to += 64; break; case 0x30: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x31: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, 
mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x32: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x33: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = 
_mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x34: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x35: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x36: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); 
byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x37: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x38: byte_stream = 
_mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x39: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, 
_mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x3a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x3b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); 
byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x3c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); 
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x3d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x3e: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; case 0x3f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); byte_stream = _mm_srli_epi64(byte_stream, 3); _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); in += 16; to += 40; break; case 0x40: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, 
mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x41: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x42: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = 
_mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x43: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x44: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x45: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x46: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x47: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x48: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); 
byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x49: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x4a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 
4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x4b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x4c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x4d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x4e: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); 
byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; case 0x4f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); byte_stream = _mm_srli_epi64(byte_stream, 4); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); in += 16; to += 32; break; case 0x50: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = 
_mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x51: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x52: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x53: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = 
_mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x54: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x55: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x56: byte_stream = 
_mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x57: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x58: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x59: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x5a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x5b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x5c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x5d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x5e: byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; case 0x5f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); byte_stream = _mm_srli_epi64(byte_stream, 5); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); in += 16; to += 24; break; case 0x60: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x61: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x62: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x63: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x64: byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x65: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x66: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x67: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x68: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x69: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x6a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x6b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x6c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x6d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x6e: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; case 0x6f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); byte_stream = _mm_srli_epi64(byte_stream, 6); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); in += 16; to += 20; break; case 0x70: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x71: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x72: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x73: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x74: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x75: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), 
_mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x76: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x77: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = 
_mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x78: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); 
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x79: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x7a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); 
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x7b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x7c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream 
= _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x7d: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x7e: byte_stream = 
_mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; case 0x7f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); byte_stream = _mm_srli_epi32(byte_stream_2, 3); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); 
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); byte_stream = _mm_srli_epi32(byte_stream, 7); _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); in += 32; to += 36; break; case 0x80: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x81: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x82: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 
0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x83: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x84: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x85: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x86: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = 
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x87: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x88: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x89: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); 
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x8a: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x8b: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x8c: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x8d: tmp = 
_mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x8e: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; case 0x8f: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); in += 16; to += 16; break; case 0x90: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = 
_mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x91: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x92: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x93: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x94: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x95: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x96: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 
4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x97: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x98: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x99: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x9a: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, 
_mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x9b: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x9c: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x9d: byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x9e: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; case 0x9f: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); byte_stream = _mm_srli_epi32(byte_stream_2, 4); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); byte_stream = _mm_srli_epi32(byte_stream, 9); _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); in += 32; to += 28; break; case 0xa0: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa1: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa2: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa3: byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa4: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa5: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa6: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa7: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa8: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xa9: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xaa: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xab: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xac: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xad: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = 
_mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xae: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; case 0xaf: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); byte_stream = _mm_srli_epi64(byte_stream, 10); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); in += 16; to += 12; break; case 0xb0: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb1: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 
= _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb2: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb3: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb4: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = 
_mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb5: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb6: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb7: byte_stream = 
_mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb8: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xb9: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xba: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xbb: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xbc: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xbd: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xbe: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; case 0xbf: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); byte_stream = _mm_srli_epi32(byte_stream_2, 8); _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); byte_stream = _mm_srli_epi32(byte_stream, 12); _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); in += 32; to += 20; break; case 0xc0: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc1: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc2: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc3: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc4: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc5: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc6: tmp = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc7: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc8: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xc9: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xca: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xcb: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xcc: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xcd: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xce: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; case 0xcf: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); in += 16; to += 8; break; case 0xd0: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd1: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd2: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd3: byte_stream = 
_mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd4: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd5: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd6: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd7: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd8: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xd9: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xda: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xdb: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xdc: byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xdd: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xde: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; case 0xdf: byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; to += 12; break; case 0xe0: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe1: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe2: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, 
tmp); in += 16; to += 4; case 0xe3: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe4: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe5: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe6: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe7: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe8: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xe9: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xea: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xeb: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xec: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xed: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xee: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; case 0xef: tmp = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, tmp); in += 16; to += 4; break; case 0xf0: *to = *(uint8_t *)in; in += 1; to += 1; case 0xf1: *to = *(uint8_t *)in; in += 1; to += 1; case 0xf2: *to = *(uint8_t *)in; in += 1; to += 1; case 0xf3: *to = *(uint8_t *)in; in += 1; to += 1; break; case 0xf4: *to = *(uint16_t *)in; in += 2; to += 1; case 0xf5: *to = *(uint16_t *)in; in += 2; to += 1; case 0xf6: *to = *(uint16_t *)in; in += 2; to += 1; case 0xf7: *to = *(uint16_t *)in; in += 2; to += 1; break; case 0xf8: *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); in += 3; to += 1; case 0xf9: *to = (*(uint8_t *)in << 16) | 
(*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); in += 3; to += 1; case 0xfa: *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); in += 3; to += 1; case 0xfb: *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); in += 3; to += 1; break; case 0xfc: *to = *(uint32_t *)in; in += 4; to += 1; case 0xfd: *to = *(uint32_t *)in; in += 4; to += 1; case 0xfe: *to = *(uint32_t *)in; in += 4; to += 1; case 0xff: *to = *(uint32_t *)in; in += 4; to += 1; break; break; } } }