Files
TurboPFor-Integer-Compression/ext/bench_/bench/compress_qmx_v2_decompress.cpp
2017-01-02 23:30:16 +01:00

5449 lines
282 KiB
C++

/*
	Bit-mask tables used by ANT_compress_qmx_v2::decodeArray().
	Each table holds one 32-bit value, with only its low N bits set (N is the
	number in the table's name), repeated four times.  decodeArray() reads each
	table with a single 128-bit _mm_loadu_si128(), so all four 32-bit lanes of
	the resulting register carry the same mask.
	NOTE(review): ALIGN_16 is defined outside this section; presumably it
	requests 16-byte alignment of the table -- confirm against its definition.
*/
static uint32_t ALIGN_16 static_mask_21[4] = {0x001fffffu, 0x001fffffu, 0x001fffffu, 0x001fffffu};	// low 21 bits
static uint32_t ALIGN_16 static_mask_12[4] = {0x00000fffu, 0x00000fffu, 0x00000fffu, 0x00000fffu};	// low 12 bits
static uint32_t ALIGN_16 static_mask_10[4] = {0x000003ffu, 0x000003ffu, 0x000003ffu, 0x000003ffu};	// low 10 bits
static uint32_t ALIGN_16 static_mask_9[4] = {0x000001ffu, 0x000001ffu, 0x000001ffu, 0x000001ffu};	// low 9 bits
static uint32_t ALIGN_16 static_mask_7[4] = {0x0000007fu, 0x0000007fu, 0x0000007fu, 0x0000007fu};	// low 7 bits
static uint32_t ALIGN_16 static_mask_6[4] = {0x0000003fu, 0x0000003fu, 0x0000003fu, 0x0000003fu};	// low 6 bits
static uint32_t ALIGN_16 static_mask_5[4] = {0x0000001fu, 0x0000001fu, 0x0000001fu, 0x0000001fu};	// low 5 bits
static uint32_t ALIGN_16 static_mask_4[4] = {0x0000000fu, 0x0000000fu, 0x0000000fu, 0x0000000fu};	// low 4 bits
static uint32_t ALIGN_16 static_mask_3[4] = {0x00000007u, 0x00000007u, 0x00000007u, 0x00000007u};	// low 3 bits
static uint32_t ALIGN_16 static_mask_2[4] = {0x00000003u, 0x00000003u, 0x00000003u, 0x00000003u};	// low 2 bits
static uint32_t ALIGN_16 static_mask_1[4] = {0x00000001u, 0x00000001u, 0x00000001u, 0x00000001u};	// low 1 bit
void ANT_compress_qmx_v2::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)
{
__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;
uint8_t *in = (uint8_t *)source;
uint8_t *keys = ((uint8_t *)source) + len - 1;
mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);
mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);
mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);
mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);
mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);
mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);
mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);
mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);
mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);
mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);
mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);
while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers
{
switch (*keys--)
{
case 0x00:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x01:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x02:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x03:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x04:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x05:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x06:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x07:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x08:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x09:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x0a:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x0b:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x0c:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x0d:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x0e:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
case 0x0f:
#ifdef NO_ZEROS
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
#endif
_mm_storeu_si128((__m128i *)to, tmp);
_mm_storeu_si128((__m128i *)to + 1, tmp);
_mm_storeu_si128((__m128i *)to + 2, tmp);
_mm_storeu_si128((__m128i *)to + 3, tmp);
_mm_storeu_si128((__m128i *)to + 4, tmp);
_mm_storeu_si128((__m128i *)to + 5, tmp);
_mm_storeu_si128((__m128i *)to + 6, tmp);
_mm_storeu_si128((__m128i *)to + 7, tmp);
_mm_storeu_si128((__m128i *)to + 8, tmp);
_mm_storeu_si128((__m128i *)to + 9, tmp);
_mm_storeu_si128((__m128i *)to + 10, tmp);
_mm_storeu_si128((__m128i *)to + 11, tmp);
_mm_storeu_si128((__m128i *)to + 12, tmp);
_mm_storeu_si128((__m128i *)to + 13, tmp);
_mm_storeu_si128((__m128i *)to + 14, tmp);
_mm_storeu_si128((__m128i *)to + 15, tmp);
_mm_storeu_si128((__m128i *)to + 16, tmp);
_mm_storeu_si128((__m128i *)to + 17, tmp);
_mm_storeu_si128((__m128i *)to + 18, tmp);
_mm_storeu_si128((__m128i *)to + 19, tmp);
_mm_storeu_si128((__m128i *)to + 20, tmp);
_mm_storeu_si128((__m128i *)to + 21, tmp);
_mm_storeu_si128((__m128i *)to + 22, tmp);
_mm_storeu_si128((__m128i *)to + 23, tmp);
_mm_storeu_si128((__m128i *)to + 24, tmp);
_mm_storeu_si128((__m128i *)to + 25, tmp);
_mm_storeu_si128((__m128i *)to + 26, tmp);
_mm_storeu_si128((__m128i *)to + 27, tmp);
_mm_storeu_si128((__m128i *)to + 28, tmp);
_mm_storeu_si128((__m128i *)to + 29, tmp);
_mm_storeu_si128((__m128i *)to + 30, tmp);
_mm_storeu_si128((__m128i *)to + 31, tmp);
_mm_storeu_si128((__m128i *)to + 32, tmp);
_mm_storeu_si128((__m128i *)to + 33, tmp);
_mm_storeu_si128((__m128i *)to + 34, tmp);
_mm_storeu_si128((__m128i *)to + 35, tmp);
_mm_storeu_si128((__m128i *)to + 36, tmp);
_mm_storeu_si128((__m128i *)to + 37, tmp);
_mm_storeu_si128((__m128i *)to + 38, tmp);
_mm_storeu_si128((__m128i *)to + 39, tmp);
_mm_storeu_si128((__m128i *)to + 40, tmp);
_mm_storeu_si128((__m128i *)to + 41, tmp);
_mm_storeu_si128((__m128i *)to + 42, tmp);
_mm_storeu_si128((__m128i *)to + 43, tmp);
_mm_storeu_si128((__m128i *)to + 44, tmp);
_mm_storeu_si128((__m128i *)to + 45, tmp);
_mm_storeu_si128((__m128i *)to + 46, tmp);
_mm_storeu_si128((__m128i *)to + 47, tmp);
_mm_storeu_si128((__m128i *)to + 48, tmp);
_mm_storeu_si128((__m128i *)to + 49, tmp);
_mm_storeu_si128((__m128i *)to + 50, tmp);
_mm_storeu_si128((__m128i *)to + 51, tmp);
_mm_storeu_si128((__m128i *)to + 52, tmp);
_mm_storeu_si128((__m128i *)to + 53, tmp);
_mm_storeu_si128((__m128i *)to + 54, tmp);
_mm_storeu_si128((__m128i *)to + 55, tmp);
_mm_storeu_si128((__m128i *)to + 56, tmp);
_mm_storeu_si128((__m128i *)to + 57, tmp);
_mm_storeu_si128((__m128i *)to + 58, tmp);
_mm_storeu_si128((__m128i *)to + 59, tmp);
_mm_storeu_si128((__m128i *)to + 60, tmp);
_mm_storeu_si128((__m128i *)to + 61, tmp);
_mm_storeu_si128((__m128i *)to + 62, tmp);
_mm_storeu_si128((__m128i *)to + 63, tmp);
to += 256;
break;
case 0x10:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
in += 16;
to += 128;
case 0x11:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
in += 16;
to += 128;
case 0x12:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
in += 16;
to += 128;
case 0x13:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
in += 16;
to += 128;
case 0x14:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x15, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x15:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x16, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x16:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x17, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x17:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x18, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x18:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x19, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x19:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x1a, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x1a:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x1b, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x1b:
// 1-bit unpack of one 16-byte block: reads 128 bits from `in` and writes 128
// uint32_t values (each 0 or 1) to `to`. Store k (k = 0..31) fills
// to[4k..4k+3], where to[4k+j] is bit k of the j-th 32-bit word of the block.
// The 64-bit lane shift is safe here: only bit 0 of each 32-bit lane survives
// the AND with mask_1, and with at most 31 shifts that bit always originates
// from the same 32-bit word.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// Advance past the 16 input bytes consumed and the 128 integers produced.
in += 16;
to += 128;
// No break: control falls through into case 0x1c, which decodes the next
// 16-byte block from the advanced `in`.
// NOTE(review): presumably deliberate (the key value selecting how many
// consecutive blocks get decoded) -- confirm against the encoder.
case 0x1c:
// 1-bit unpack of one 16-byte word: 32 stores x 4 lanes = 128 integers.
// Each store keeps only bit 0 of every 32-bit lane (mask_1 is loaded from
// static_mask_1 = {1, 1, 1, 1}); the word is then shifted right by one bit.
// The shift is _mm_srli_epi64 (64-bit lanes), so bits of the upper dword of
// each qword drift into the top of the lower dword, but the cumulative shift
// never exceeds 31, so none of them reaches the masked bit 0.
// Entering the switch here decodes four words in total: this one, then the
// bodies of cases 0x1d, 0x1e and 0x1f, up to the break after case 0x1f.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// One 128-bit input word consumed; 32 vectors * 4 uint32_t produced.
in += 16;
to += 128;
// No break: falls through into case 0x1d, which decodes the next input word.
case 0x1d:
// 1-bit unpack of one 16-byte word: 32 stores x 4 lanes = 128 integers.
// Each store keeps only bit 0 of every 32-bit lane (mask_1 is loaded from
// static_mask_1 = {1, 1, 1, 1}); the word is then shifted right by one bit.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 31, so no bit from the neighbouring dword reaches the masked bit 0.
// Entering the switch here decodes three words in total: this one, then the
// bodies of cases 0x1e and 0x1f, up to the break after case 0x1f.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// One 128-bit input word consumed; 32 vectors * 4 uint32_t produced.
in += 16;
to += 128;
// No break: falls through into case 0x1e, which decodes the next input word.
case 0x1e:
// 1-bit unpack of one 16-byte word: 32 stores x 4 lanes = 128 integers.
// Each store keeps only bit 0 of every 32-bit lane (mask_1 is loaded from
// static_mask_1 = {1, 1, 1, 1}); the word is then shifted right by one bit.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 31, so no bit from the neighbouring dword reaches the masked bit 0.
// Entering the switch here decodes two words in total: this one, then the
// body of case 0x1f, up to the break after case 0x1f.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// One 128-bit input word consumed; 32 vectors * 4 uint32_t produced.
in += 16;
to += 128;
// No break: falls through into case 0x1f, which decodes the next input word.
case 0x1f:
// Last stage of the 1-bit chain: unpack one 16-byte word into 128 integers
// (32 stores x 4 lanes), then leave the switch.
// Each store keeps only bit 0 of every 32-bit lane (mask_1 is loaded from
// static_mask_1 = {1, 1, 1, 1}); the word is then shifted right by one bit.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 31, so no bit from the neighbouring dword reaches the masked bit 0.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
byte_stream = _mm_srli_epi64(byte_stream, 1);
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
// One 128-bit input word consumed; 32 vectors * 4 uint32_t produced.
in += 16;
to += 128;
// End of the 1-bit fallthrough chain.
break;
case 0x20:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The shift is _mm_srli_epi64 (64-bit lanes), so bits of the upper dword of
// each qword drift into the top of the lower dword, but the cumulative shift
// never exceeds 30, so none of them reaches the masked low bits.
// NOTE(review): first entry of the 2-bit fallthrough chain; presumably the
// chain runs down to case 0x2f (the terminating break is further down the
// function) - confirm against the encoder's key layout.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x21, which decodes the next input word.
case 0x21:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x22, which decodes the next input word.
case 0x22:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x23, which decodes the next input word.
case 0x23:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x24, which decodes the next input word.
case 0x24:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x25, which decodes the next input word.
case 0x25:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x26, which decodes the next input word.
case 0x26:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x27, which decodes the next input word.
case 0x27:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x28, which decodes the next input word.
case 0x28:
// 2-bit unpack of one 16-byte word: 16 stores x 4 lanes = 64 integers.
// Each store keeps the low 2 bits of every 32-bit lane (mask_2 is loaded from
// static_mask_2 = {3, 3, 3, 3}); the word is then shifted right by two bits.
// The 64-bit-lane shift is safe because the cumulative shift never exceeds
// 30, so no bit from the neighbouring dword reaches the masked low bits.
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
// One 128-bit input word consumed; 16 vectors * 4 uint32_t produced.
in += 16;
to += 64;
// No break: falls through into case 0x29, which decodes the next input word.
case 0x29:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
case 0x2a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
case 0x2b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
case 0x2c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
case 0x2d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
case 0x2e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
case 0x2f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
byte_stream = _mm_srli_epi64(byte_stream, 2);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
in += 16;
to += 64;
break;
case 0x30:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x31:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x32:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x33:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x34:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x35:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x36:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x37:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x38:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x39:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x3a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x3b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x3c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
case 0x3d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
// Last two rungs of the 3-bit ladder. Each case body unpacks one 16-byte block from
// `in` into 40 integers: 10 stores of four 32-bit lanes, with
// to[4 * k + lane] = (lane >> (3 * k)) & 0x07 for k = 0..9.
// _mm_srli_epi64 shifts across the two 32-bit lanes of each 64-bit half, but the
// per-lane mask keeps only the low 3 bits, so bits pulled in from the neighbouring
// lane never reach the output.
// The preceding case and case 0x3e carry no break on purpose: control runs on into
// the next label, so 0x3e decodes two blocks and 0x3f decodes one.
case 0x3e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
// fall through
case 0x3f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
byte_stream = _mm_srli_epi64(byte_stream, 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
in += 16;
to += 40;
break;
// 4-bit ladder (keys 0x40..0x4f). Each case body unpacks one 16-byte block from `in`
// into 32 integers: 8 stores of four 32-bit lanes, with
// to[4 * k + lane] = (lane >> (4 * k)) & 0x0f for k = 0..7.
// _mm_srli_epi64 shifts across the two 32-bit lanes of each 64-bit half, but the
// per-lane mask keeps only the low 4 bits, so bits pulled in from the neighbouring
// lane never reach the output.
// Only case 0x4f ends in a break. Every other case deliberately falls through to the
// next label, so the low nibble n of the key selects a run of (16 - n) blocks:
// 0x40 decodes 16 blocks (512 integers), 0x4f decodes 1 block (32 integers).
case 0x40:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x41:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x42:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x43:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x44:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x45:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x46:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x47:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x48:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x49:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x4a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x4b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x4c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x4d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x4e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
// fall through
case 0x4f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
byte_stream = _mm_srli_epi64(byte_stream, 4);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
in += 16;
to += 32;
break;
// 5-bit ladder (keys 0x50..0x5f). Each case body unpacks one 16-byte block from `in`
// into 24 integers: 6 stores of four 32-bit lanes, with
// to[4 * k + lane] = (lane >> (5 * k)) & 0x1f for k = 0..5.
// Six 5-bit fields use 30 bits, so the top 2 bits of every lane are never emitted.
// _mm_srli_epi64 shifts across the two 32-bit lanes of each 64-bit half, but the
// per-lane mask keeps only the low 5 bits, so bits pulled in from the neighbouring
// lane never reach the output.
// Only case 0x5f ends in a break. Every other case deliberately falls through to the
// next label, so the low nibble n of the key selects a run of (16 - n) blocks:
// 0x50 decodes 16 blocks (384 integers), 0x5f decodes 1 block (24 integers).
case 0x50:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x51:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x52:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x53:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x54:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x55:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x56:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x57:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x58:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x59:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x5a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x5b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x5c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x5d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x5e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
// fall through
case 0x5f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
byte_stream = _mm_srli_epi64(byte_stream, 5);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
in += 16;
to += 24;
break;
// 6-bit ladder, first eight rungs (keys 0x60..0x67). Each case body unpacks one
// 16-byte block from `in` into 20 integers: 5 stores of four 32-bit lanes, with
// to[4 * k + lane] = (lane >> (6 * k)) & 0x3f for k = 0..4.
// Five 6-bit fields use 30 bits, so the top 2 bits of every lane are never emitted.
// _mm_srli_epi64 shifts across the two 32-bit lanes of each 64-bit half, but the
// per-lane mask keeps only the low 6 bits, so bits pulled in from the neighbouring
// lane never reach the output.
// None of these cases ends in a break: each deliberately falls through to the next
// label, so a lower key value decodes more consecutive blocks.
case 0x60:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x61:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x62:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x63:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x64:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x65:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x66:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x67:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
// fall through
case 0x68:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x69:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x6a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x6b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x6c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x6d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x6e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
case 0x6f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
byte_stream = _mm_srli_epi64(byte_stream, 6);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
in += 16;
to += 20;
break;
// Key 0x7N: 7-bit packed integers.
// Every case body below is identical: it reads two 16-byte vectors (32 bytes)
// from `in` and writes 36 integers (nine vectors of four) to `to`.
//   stores 0-3 : bits 0-6, 7-13, 14-20, 21-27 of each 32-bit lane of vector 1
//   store  4   : straddles both vectors - the 4 leftover bits (28-31) of each
//                lane of vector 1 form the low bits, and the low 3 bits of the
//                matching lane of vector 2 (shifted left by 4) the high bits
//   stores 5-8 : bits 3-9, 10-16, 17-23, 24-30 of each lane of vector 2
// Bit 31 of every lane of vector 2 is not read.  Output is lane-interleaved:
// store k writes to[4*k + lane].  `(__m128i *)in + 1` is in + 16 bytes.
// There is deliberately no break between the cases: entering at key 0x7N
// runs every body down to 0x7f, decoding (0x80 - 0x7N) consecutive 32-byte
// blocks, so key 0x70 decodes 16 blocks and key 0x7f decodes one.
case 0x70:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// Field 4 straddles the two vectors: 4 bits from vector 1, 3 bits from vector 2.
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
// Drop the 3 bits of vector 2 already consumed by the straddling field.
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x71:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x72:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x73:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x74:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x75:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x76:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x77:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x78:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x79:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x7a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x7b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x7c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x7d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x7e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// fall through
case 0x7f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
byte_stream = _mm_srli_epi32(byte_stream, 7);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
in += 32;
to += 36;
// End of the 7-bit run: return to the key loop for the next selector byte.
break;
// Key 0x8N: one byte per integer.
// Every case body below is identical: it loads one 16-byte block from `in`
// and zero-extends each byte to a 32-bit integer, writing 16 integers to `to`
// in byte order (to[i] = in[i]).
// _mm_cvtepu8_epi32 (an SSE4.1 intrinsic) widens only the low four bytes of
// its operand, so the other three 4-byte groups are first moved into lane 0:
//   _mm_shuffle_ps(x, x, 0x01) puts 32-bit lane 1 of x into lane 0;
//   _mm_movehl_ps(x, x) copies the upper 64 bits of x into the lower 64 bits.
// The float casts only reinterpret the register; no value conversion happens.
// There is deliberately no break between the cases, so entering at a given
// key also runs every later case body in this run.
// NOTE(review): the end of this run lies beyond the part of the function
// shown here; presumably it ends with a break after case 0x8f - confirm.
case 0x80:
tmp = _mm_loadu_si128((__m128i *)in);
// bytes 0-3
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
// bytes 4-7
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
// bytes 8-11 (tmp now holds the upper half of the block in its low 64 bits)
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
// bytes 12-15
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x81:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x82:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x83:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x84:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x85:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x86:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x87:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x88:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x89:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x8a:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x8b:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x8c:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x8d:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
// fall through
case 0x8e:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
case 0x8f:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
in += 16;
to += 16;
break;
case 0x90:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x91:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x92:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x93:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x94:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x95:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x96:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x97:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x98:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x99:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x9a:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x9b:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x9c:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x9d:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x9e:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
case 0x9f:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
byte_stream = _mm_srli_epi32(byte_stream, 9);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
in += 32;
to += 28;
break;
case 0xa0:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa1:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa2:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa3:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa4:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa5:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa6:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa7:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa8:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xa9:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xaa:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xab:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xac:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xad:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xae:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
case 0xaf:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
byte_stream = _mm_srli_epi64(byte_stream, 10);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
in += 16;
to += 12;
break;
case 0xb0:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb1:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb2:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb3:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb4:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb5:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb6:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb7:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb8:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xb9:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xba:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xbb:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xbc:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xbd:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xbe:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
case 0xbf:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
byte_stream = _mm_srli_epi32(byte_stream, 12);
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
in += 32;
to += 20;
break;
case 0xc0:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc1:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc2:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc3:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc4:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc5:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc6:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc7:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc8:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xc9:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xca:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xcb:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xcc:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xcd:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xce:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
case 0xcf:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
in += 16;
to += 8;
break;
case 0xd0:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd1:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd2:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd3:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd4:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd5:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd6:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd7:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd8:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xd9:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xda:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xdb:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xdc:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xdd:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xde:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
case 0xdf:
byte_stream = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
in += 32;
to += 12;
break;
case 0xe0:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe1:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe2:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe3:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe4:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe5:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe6:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe7:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe8:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xe9:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xea:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xeb:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xec:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xed:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xee:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
case 0xef:
tmp = _mm_loadu_si128((__m128i *)in);
_mm_storeu_si128((__m128i *)to, tmp);
in += 16;
to += 4;
break;
case 0xf0:
*to = *(uint8_t *)in;
in += 1;
to += 1;
case 0xf1:
*to = *(uint8_t *)in;
in += 1;
to += 1;
case 0xf2:
*to = *(uint8_t *)in;
in += 1;
to += 1;
case 0xf3:
*to = *(uint8_t *)in;
in += 1;
to += 1;
break;
case 0xf4:
*to = *(uint16_t *)in;
in += 2;
to += 1;
case 0xf5:
*to = *(uint16_t *)in;
in += 2;
to += 1;
case 0xf6:
*to = *(uint16_t *)in;
in += 2;
to += 1;
case 0xf7:
*to = *(uint16_t *)in;
in += 2;
to += 1;
break;
case 0xf8:
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
in += 3;
to += 1;
case 0xf9:
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
in += 3;
to += 1;
case 0xfa:
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
in += 3;
to += 1;
case 0xfb:
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
in += 3;
to += 1;
break;
case 0xfc:
*to = *(uint32_t *)in;
in += 4;
to += 1;
case 0xfd:
*to = *(uint32_t *)in;
in += 4;
to += 1;
case 0xfe:
*to = *(uint32_t *)in;
in += 4;
to += 1;
case 0xff:
*to = *(uint32_t *)in;
in += 4;
to += 1;
break;
break;
}
}
}