Files
TurboPFor-Integer-Compression/ext/bench_/bench/compress_qmx_v3_decompress.cpp
2017-01-02 23:30:16 +01:00

33909 lines
2.1 MiB

// Low-bit masks used by the QMX decoder, one table per bit width that appears
// in this file (21, 12, 10, 9, 7, 6, 5, 4, 3, 2 and 1 bits).  Each table holds
// the same mask four times, i.e. one copy per 32-bit lane, so decodeArray()
// can pull a whole table into an __m128i with a single _mm_loadu_si128().
// (1u << N) - 1 is the value with the N least-significant bits set.
// NOTE(review): ALIGN_16 is defined outside this section; presumably it
// requests 16-byte alignment for the array - confirm against its definition.
static uint32_t ALIGN_16 static_mask_21[] = {
	(1u << 21) - 1, (1u << 21) - 1, (1u << 21) - 1, (1u << 21) - 1
};
static uint32_t ALIGN_16 static_mask_12[] = {
	(1u << 12) - 1, (1u << 12) - 1, (1u << 12) - 1, (1u << 12) - 1
};
static uint32_t ALIGN_16 static_mask_10[] = {
	(1u << 10) - 1, (1u << 10) - 1, (1u << 10) - 1, (1u << 10) - 1
};
static uint32_t ALIGN_16 static_mask_9[] = {
	(1u << 9) - 1, (1u << 9) - 1, (1u << 9) - 1, (1u << 9) - 1
};
static uint32_t ALIGN_16 static_mask_7[] = {
	(1u << 7) - 1, (1u << 7) - 1, (1u << 7) - 1, (1u << 7) - 1
};
static uint32_t ALIGN_16 static_mask_6[] = {
	(1u << 6) - 1, (1u << 6) - 1, (1u << 6) - 1, (1u << 6) - 1
};
static uint32_t ALIGN_16 static_mask_5[] = {
	(1u << 5) - 1, (1u << 5) - 1, (1u << 5) - 1, (1u << 5) - 1
};
static uint32_t ALIGN_16 static_mask_4[] = {
	(1u << 4) - 1, (1u << 4) - 1, (1u << 4) - 1, (1u << 4) - 1
};
static uint32_t ALIGN_16 static_mask_3[] = {
	(1u << 3) - 1, (1u << 3) - 1, (1u << 3) - 1, (1u << 3) - 1
};
static uint32_t ALIGN_16 static_mask_2[] = {
	(1u << 2) - 1, (1u << 2) - 1, (1u << 2) - 1, (1u << 2) - 1
};
static uint32_t ALIGN_16 static_mask_1[] = {
	(1u << 1) - 1, (1u << 1) - 1, (1u << 1) - 1, (1u << 1) - 1
};
void ANT_compress_qmx_v3::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)
{
__m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;
uint8_t *in = (uint8_t *)source;
uint8_t *keys = ((uint8_t *)source) + len - 1;
mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);
mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);
mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);
mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);
mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);
mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);
mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);
mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);
mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);
mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);
mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);
while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers
{
switch (*keys--)
{
case 0x00:
{
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 640, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 704, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 768, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 832, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 896, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 960, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 960 + 63, tmp);
// The last store above targets vector slot 960 + 63 = 1023. Slots are 16-byte
// __m128i units while `to` is a uint32_t pointer (4 bytes per element), so
// 1024 slots correspond to 4096 elements: this moves `to` just past that slot.
to += 4096;
break;
}
case 0x01:
{
#ifdef NO_ZEROS
// NO_ZEROS build: the fill vector is static_mask_1, i.e. the value 1 in each
// of the four 32-bit lanes.
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// Default build: the fill vector is all zero bits. The previous form XORed
// `tmp` with itself inside its own initializer, which reads the variable
// before it has a value (undefined behaviour, and flagged by compilers as an
// uninitialized use). _mm_setzero_si128() yields the same all-zero vector
// without touching an indeterminate object.
const __m128i tmp = _mm_setzero_si128();
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 640, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 704, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 768, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 832, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 896, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 896 + 63, tmp);
to += 3840;
break;
}
case 0x02:
{
/*
	Fill vector for this case: the same 128-bit value is written to every output slot below.
	With NO_ZEROS defined each 32-bit lane holds 1 (loaded from static_mask_1), otherwise each lane holds 0.
*/
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	Use the dedicated zeroing intrinsic.  The previous form XORed tmp with itself inside its own
	initialiser, which reads an indeterminate value (undefined behaviour) just to obtain zero.
*/
const __m128i tmp = _mm_setzero_si128();
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 640, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 704, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 768, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 832, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 832 + 63, tmp);
to += 3584;
break;
}
case 0x03:
{
/*
	Fill value for this run of identical integers: every 128-bit store that
	follows in this case writes `tmp` (four 32-bit lanes) to the output.
	With NO_ZEROS defined the value is loaded from static_mask_1 (1 in each
	lane); otherwise each lane is 0.
*/
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	Use the dedicated zeroing intrinsic. XOR-ing `tmp` with itself inside its
	own initialiser reads an indeterminate value, which is undefined behaviour
	and triggers uninitialised-variable warnings.
*/
const __m128i tmp = _mm_setzero_si128();
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 640, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 704, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 768, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
to += 3328;
break;
}
case 0x04:
{
// Fill value written by every store in this case.
#ifdef NO_ZEROS
// NO_ZEROS build: each 32-bit lane is 1 (static_mask_1 is {1, 1, 1, 1}).
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// Default build: each 32-bit lane is 0.  Use the dedicated zeroing intrinsic
// rather than XOR-ing the variable with itself inside its own initialiser,
// which reads an indeterminate value before `tmp` has been initialised.
const __m128i tmp = _mm_setzero_si128();
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 640, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 704, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
to += 3072;
break;
}
case 0x05:
{
#ifdef NO_ZEROS
// NO_ZEROS build: the fill value is static_mask_1, i.e. four 32-bit lanes each holding 1.
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// Default build: the fill value is all-zero. Use _mm_setzero_si128() instead of XOR-ing
// tmp with itself inside its own initialiser, which reads tmp before it has a value
// (an indeterminate read that the optimiser is not required to turn into zero).
const __m128i tmp = _mm_setzero_si128();
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 640, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
to += 2816;
break;
}
case 0x06:
{
#ifdef NO_ZEROS
// NO_ZEROS build: the fill value is 1 in every 32-bit lane (static_mask_1 = {1, 1, 1, 1}).
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// Default build: the fill value is an all-zero vector.  Use _mm_setzero_si128()
// rather than XOR-ing tmp with itself inside its own initializer, which reads
// an uninitialized variable.
const __m128i tmp = _mm_setzero_si128();
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 448, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 512, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 576, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
to += 2560;
break;
}
case 0x07:
	{
	/*
		Key 0x07: emit a run of 2304 identical 32-bit integers.

		The fill value is 1 when NO_ZEROS is defined (static_mask_1 holds four 1s)
		and 0 otherwise.  Nothing is read from the payload pointer `in` for this
		key; only the output cursor `to` moves.

		2304 integers == 576 unaligned 128-bit stores of 4 integers each.
	*/
#ifdef NO_ZEROS
	const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
	/*
		Previously this was "tmp = tmp XOR tmp", which reads tmp inside its own
		initialiser (an indeterminate value, hence undefined behaviour).
		_mm_setzero_si128() yields the same all-zero vector without that read.
	*/
	const __m128i tmp = _mm_setzero_si128();
#endif

	/* Same destinations as the former unrolled stores: vector slots 0 .. 575 of `to`. */
	for (uint32_t vector = 0; vector < 576; vector++)
		_mm_storeu_si128((__m128i *)to + vector, tmp);

	/* Advance the output cursor past the 576 * 4 integers just written. */
	to += 2304;
	break;
	}
case 0x08:
{
	/*
		Key 0x08: emit a run of 2048 identical 32-bit integers.
		Nothing is read from `in` for this key; the whole run is produced from a
		single constant vector.  With NO_ZEROS defined every integer is 1
		(static_mask_1), otherwise every integer is 0.
	*/
#ifdef NO_ZEROS
	const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
	/*
		The previous form initialised tmp with (tmp XOR tmp), which reads tmp
		before it has a value (undefined behaviour).  _mm_setzero_si128()
		yields the same all-zero register without that read.
	*/
	const __m128i tmp = _mm_setzero_si128();
#endif

	/*
		512 unaligned 128-bit stores of 4 integers each = 2048 integers,
		written in ascending address order.
	*/
	const uint32_t vectors_in_run = 512;
	for (uint32_t vector = 0; vector < vectors_in_run; vector++)
		_mm_storeu_si128((__m128i *)to + vector, tmp);

	to += 2048;		// 512 vectors * 4 integers per vector
	break;
}
case 0x09:
{
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
#endif
_mm_storeu_si128((__m128i *)to + 0, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 64, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 128, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 192, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 256, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 320, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
_mm_storeu_si128((__m128i *)to + 384, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
to += 1792;
break;
}
case 0x0a:
{
	// Selector 0x0a: emit a run of 1536 identical integers.
	//
	// This case reads nothing through `in`; every output integer is the same
	// constant: 0 normally, or 1 (the lanes of static_mask_1) when the file is
	// built with NO_ZEROS.
	//
	// There is no bounds check here: 1536 uint32_t values are written
	// unconditionally starting at `to`, so the destination must have room.
	const uint32_t integers_in_run = 1536;
	const uint32_t vectors_in_run = integers_in_run / 4;		// four 32-bit lanes per __m128i => 384 stores

#ifdef NO_ZEROS
	const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
	// The previous code XORed `tmp` with itself inside its own initialiser,
	// which reads an uninitialised object (undefined behaviour).
	// _mm_setzero_si128() yields the intended all-zero vector without that read.
	const __m128i tmp = _mm_setzero_si128();
#endif

	// Unaligned stores are used because `to` is only a uint32_t pointer here.
	__m128i *const out = (__m128i *)to;
	for (uint32_t vector = 0; vector < vectors_in_run; ++vector)
		_mm_storeu_si128(out + vector, tmp);

	to += integers_in_run;
	break;
}
case 0x0b:
{
// Selector 0x0b: emit a run of 320 128-bit vectors (1280 uint32_t values) of a
// constant fill value without reading anything from the payload pointer `in`.
// The fill value is 1 in every lane when NO_ZEROS is defined (static_mask_1),
// otherwise 0 in every lane.  Presumably this is the "run of 0-bit integers"
// case mentioned at the top of the decode loop -- TODO confirm against the encoder.
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// The previous form XOR-ed `tmp` with itself inside its own initialiser, which
// reads an indeterminate value (undefined behaviour, -Wuninitialized).
// _mm_setzero_si128() yields the same all-zero vector with defined behaviour.
const __m128i tmp = _mm_setzero_si128();
#endif
// Unaligned stores: `to` is only guaranteed to be uint32_t-aligned here.
__m128i *const out = (__m128i *)to;
for (uint32_t vec = 0; vec < 320; vec++)
	_mm_storeu_si128(out + vec, tmp);
// 320 vectors * 4 lanes = 1280 integers produced.
to += 1280;
break;
}
case 0x0c:
{
// Selector 0x0c: emit a run of 256 128-bit vectors (1024 uint32_t values) of a
// constant fill value without reading anything from the payload pointer `in`.
// The fill value is 1 in every lane when NO_ZEROS is defined (static_mask_1),
// otherwise 0 in every lane.  Presumably this is the "run of 0-bit integers"
// case mentioned at the top of the decode loop -- TODO confirm against the encoder.
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// The previous form XOR-ed `tmp` with itself inside its own initialiser, which
// reads an indeterminate value (undefined behaviour, -Wuninitialized).
// _mm_setzero_si128() yields the same all-zero vector with defined behaviour.
const __m128i tmp = _mm_setzero_si128();
#endif
// Unaligned stores: `to` is only guaranteed to be uint32_t-aligned here.
__m128i *const out = (__m128i *)to;
for (uint32_t vec = 0; vec < 256; vec++)
	_mm_storeu_si128(out + vec, tmp);
// 256 vectors * 4 lanes = 1024 integers produced.
to += 1024;
break;
}
case 0x0d:
{
// Key 0x0d: emit a run of 768 identical 32-bit integers (192 SSE vectors of
// four lanes each).  Nothing is read from `in` for this key; only `to` moves.
// The repeated value is 1 when NO_ZEROS is defined (static_mask_1 holds four
// 1s) and 0 otherwise.
#ifdef NO_ZEROS
const __m128i fill = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// The previous form XOR-ed the variable with itself inside its own
// initialiser, which reads an indeterminate value (undefined behaviour, and
// not guaranteed to yield zero).  _mm_setzero_si128() is the defined way to
// get an all-zero register.
const __m128i fill = _mm_setzero_si128();
#endif
__m128i *out = (__m128i *)to;
// Constant trip count: same 192 unaligned stores as the unrolled original,
// and the compiler remains free to unroll it.
for (uint32_t vec = 0; vec < 192; vec++)
	_mm_storeu_si128(out + vec, fill);
to += 768;		// 192 vectors * 4 integers per vector
break;
}
case 0x0e:
{
// Key 0x0e: emit a run of 512 identical 32-bit integers (128 SSE vectors of
// four lanes each).  Nothing is read from `in` for this key; only `to` moves.
// The repeated value is 1 when NO_ZEROS is defined (static_mask_1 holds four
// 1s) and 0 otherwise.
#ifdef NO_ZEROS
const __m128i fill = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// The previous form XOR-ed the variable with itself inside its own
// initialiser, which reads an indeterminate value (undefined behaviour, and
// not guaranteed to yield zero).  _mm_setzero_si128() is the defined way to
// get an all-zero register.
const __m128i fill = _mm_setzero_si128();
#endif
__m128i *out = (__m128i *)to;
// Constant trip count: same 128 unaligned stores as the unrolled original,
// and the compiler remains free to unroll it.
for (uint32_t vec = 0; vec < 128; vec++)
	_mm_storeu_si128(out + vec, fill);
to += 512;		// 128 vectors * 4 integers per vector
break;
}
case 0x0f:
{
// Key 0x0f: emit a run of 256 identical 32-bit integers (64 SSE vectors of
// four lanes each).  Nothing is read from `in` for this key; only `to` moves.
// The repeated value is 1 when NO_ZEROS is defined (static_mask_1 holds four
// 1s) and 0 otherwise.
#ifdef NO_ZEROS
const __m128i fill = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// The previous form XOR-ed the variable with itself inside its own
// initialiser, which reads an indeterminate value (undefined behaviour, and
// not guaranteed to yield zero).  _mm_setzero_si128() is the defined way to
// get an all-zero register.
const __m128i fill = _mm_setzero_si128();
#endif
__m128i *out = (__m128i *)to;
// Constant trip count: same 64 unaligned stores as the unrolled original,
// and the compiler remains free to unroll it.
for (uint32_t vec = 0; vec < 64; vec++)
	_mm_storeu_si128(out + vec, fill);
to += 256;		// 64 vectors * 4 integers per vector
break;
}
case 0x10:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 4 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 128 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 5 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 160 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 6 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 192 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 7 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 224 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 8 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 256 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 9 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 288 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 10 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 320 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 11 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 352 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 12 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 384 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at 128-bit offset 13 of `in` into 1-bit values.
// Shift s (0..31) followed by the AND with mask_1 leaves bit s of each 32-bit
// lane, and the resulting 4 x uint32 vector is stored at 128-bit slot 416 + s of `to`.
// _mm_srli_epi64 shifts whole 64-bit lanes; because mask_1 keeps only bit 0 of
// every 32-bit lane, the bits carried across from the upper half are dropped.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 14 (bytes 224..239 of `in`).
// Output vector 448 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[1792..1919].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 15 (bytes 240..255 of `in`).
// Output vector 480 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[1920..2047].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 480, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 480 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
// Step both cursors past what this case handled.
// `in` is a byte pointer: 256 bytes = 16 x 16-byte vectors, the last being the load at (__m128i *)in + 15 above.
// `to` is a uint32_t pointer: 2048 values = 512 x 4-lane vectors, the last being the store at (__m128i *)to + 480 + 31 above.
in += 256;
to += 2048;
break;
}
case 0x11:
{
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 0 (bytes 0..15 of `in`).
// Output vector 0 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[0..127].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 1 (bytes 16..31 of `in`).
// Output vector 32 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[128..255].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 2 (bytes 32..47 of `in`).
// Output vector 64 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[256..383].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 3 (bytes 48..63 of `in`).
// Output vector 96 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[384..511].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 4 (bytes 64..79 of `in`).
// Output vector 128 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[512..639].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 5 (bytes 80..95 of `in`).
// Output vector 160 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[640..767].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 6 (bytes 96..111 of `in`).
// Output vector 192 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[768..895].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit values from the 16 bytes at (__m128i *)in + 7 (bytes 112..127 of `in`).
// Output vector 224 + k (k = 0..31) receives bit k of each 32-bit lane of the input, i.e. to[896..1023].
// The shift works on 64-bit lanes, but mask_1 keeps only bit 0 of every 32-bit lane and k <= 31,
// so bits shifted across from the upper dword are discarded and each lane yields its own bit k.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
in += 240;
to += 1920;
break;
}
case 0x12:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 3. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 96..127 of `to`
// (uint32_t elements 384..511).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 3);
__m128i *const dest = (__m128i *)to + 96;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 4. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 128..159 of `to`
// (uint32_t elements 512..639).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 4);
__m128i *const dest = (__m128i *)to + 128;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 5. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 160..191 of `to`
// (uint32_t elements 640..767).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 5);
__m128i *const dest = (__m128i *)to + 160;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 6. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 192..223 of `to`
// (uint32_t elements 768..895).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 6);
__m128i *const dest = (__m128i *)to + 192;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 7. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 224..255 of `to`
// (uint32_t elements 896..1023).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 7);
__m128i *const dest = (__m128i *)to + 224;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 8. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 256..287 of `to`
// (uint32_t elements 1024..1151).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 8);
__m128i *const dest = (__m128i *)to + 256;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 9. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 288..319 of `to`
// (uint32_t elements 1152..1279).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 9);
__m128i *const dest = (__m128i *)to + 288;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 10. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 320..351 of `to`
// (uint32_t elements 1280..1407).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 10);
__m128i *const dest = (__m128i *)to + 320;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 11. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 352..383 of `to`
// (uint32_t elements 1408..1535).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 11);
__m128i *const dest = (__m128i *)to + 352;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// 1-bit unpack of the 16-byte vector at (__m128i *)in + 12. mask_1 keeps only
// bit 0 of each 32-bit lane, so output vector k ends up holding bit k of each
// of the four packed 32-bit words. Results fill vectors 384..415 of `to`
// (uint32_t elements 1536..1663).
// The shift is 64 bits wide, but counts stay below 32 and the mask discards
// everything except bit 0 of each 32-bit lane, so no bit leaks between lanes.
const __m128i lane_bits = _mm_loadu_si128((__m128i *)in + 12);
__m128i *const dest = (__m128i *)to + 384;
_mm_storeu_si128(dest, _mm_and_si128(lane_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(lane_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(lane_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(lane_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(lane_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(lane_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(lane_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(lane_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(lane_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(lane_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(lane_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(lane_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(lane_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(lane_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(lane_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(lane_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(lane_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(lane_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(lane_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(lane_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(lane_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(lane_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(lane_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(lane_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(lane_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(lane_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(lane_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(lane_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(lane_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(lane_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(lane_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(lane_bits, 31), mask_1));
}
{
// Source vector 13 of this case: one unaligned 16-byte load from (__m128i *)in + 13.
// It is expanded into 32 output vectors, (__m128i *)to + 416 .. 416 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
// Advance both cursors past the data handled by this case.
// 224 bytes = 14 source vectors of 16 bytes (the load above uses index 13).
// 1792 uint32_t = 448 output vectors of four integers (the last store above is vector 416 + 31 = 447).
// NOTE(review): assumes source vectors 0..12 are unpacked earlier in this same case — confirm against the start of the case.
in += 224;
to += 1792;
break;
}
case 0x13:
{
{
// Source vector 0 of this case: one unaligned 16-byte load from (__m128i *)in + 0.
// It is expanded into 32 output vectors, (__m128i *)to + 0 .. 0 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 1 of this case: one unaligned 16-byte load from (__m128i *)in + 1.
// It is expanded into 32 output vectors, (__m128i *)to + 32 .. 32 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 2 of this case: one unaligned 16-byte load from (__m128i *)in + 2.
// It is expanded into 32 output vectors, (__m128i *)to + 64 .. 64 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 3 of this case: one unaligned 16-byte load from (__m128i *)in + 3.
// It is expanded into 32 output vectors, (__m128i *)to + 96 .. 96 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 4 of this case: one unaligned 16-byte load from (__m128i *)in + 4.
// It is expanded into 32 output vectors, (__m128i *)to + 128 .. 128 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 5 of this case: one unaligned 16-byte load from (__m128i *)in + 5.
// It is expanded into 32 output vectors, (__m128i *)to + 160 .. 160 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 6 of this case: one unaligned 16-byte load from (__m128i *)in + 6.
// It is expanded into 32 output vectors, (__m128i *)to + 192 .. 192 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 7 of this case: one unaligned 16-byte load from (__m128i *)in + 7.
// It is expanded into 32 output vectors, (__m128i *)to + 224 .. 224 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Source vector 8 of this case: one unaligned 16-byte load from (__m128i *)in + 8.
// It is expanded into 32 output vectors, (__m128i *)to + 256 .. 256 + 31 (offsets count __m128i units, i.e. four uint32_t each).
// Output vector j keeps only bit j of every 32-bit lane: shift right by j, then AND with mask_1.
// The shift works on 64-bit lanes, which is fine for j <= 31 because mask_1 keeps just bit 0 of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 9 of `in` (byte offset 144).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 288 + k of `to`, i.e. uint32_t indices (288 + k) * 4 .. (288 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 10 of `in` (byte offset 160).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 320 + k of `to`, i.e. uint32_t indices (320 + k) * 4 .. (320 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 11 of `in` (byte offset 176).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 352 + k of `to`, i.e. uint32_t indices (352 + k) * 4 .. (352 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 12 of `in` (byte offset 192).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 384 + k of `to`, i.e. uint32_t indices (384 + k) * 4 .. (384 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
// Step `in` (a byte pointer) past the input read by this case: 208 bytes = 13 vectors of
// 16 bytes; the last vector loaded above is at index 12.
in += 208;
// Step `to` (a uint32_t pointer) past the output written by this case: 1664 integers =
// 416 vectors of 4; the last vector stored above is slot 384 + 31 = 415.
to += 1664;
break;
}
case 0x14:
{
{
// Unpack 128 one-bit integers from the 16-byte vector at index 0 of `in` (byte offset 0).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 0 + k of `to`, i.e. uint32_t indices k * 4 .. k * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 1 of `in` (byte offset 16).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 32 + k of `to`, i.e. uint32_t indices (32 + k) * 4 .. (32 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 2 of `in` (byte offset 32).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 64 + k of `to`, i.e. uint32_t indices (64 + k) * 4 .. (64 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 3 of `in` (byte offset 48).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 96 + k of `to`, i.e. uint32_t indices (96 + k) * 4 .. (96 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 4 of `in` (byte offset 64).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 128 + k of `to`, i.e. uint32_t indices (128 + k) * 4 .. (128 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack 128 one-bit integers from the 16-byte vector at index 5 of `in` (byte offset 80).
// Store k keeps bit k of each of the four 32-bit lanes and writes those four 0/1 values to
// vector slot 160 + k of `to`, i.e. uint32_t indices (160 + k) * 4 .. (160 + k) * 4 + 3.
// The shift operates on 64-bit lanes, but only bit 0 of each 32-bit lane survives the AND with
// mask_1, so for k < 32 the result equals a per-32-bit-lane shift.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 6 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 6);
    __m128i *const out = (__m128i *)to + 192;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 7 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 7);
    __m128i *const out = (__m128i *)to + 224;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 8 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 8);
    __m128i *const out = (__m128i *)to + 256;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 9 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 9);
    __m128i *const out = (__m128i *)to + 288;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 10 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 10);
    __m128i *const out = (__m128i *)to + 320;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 11 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 11);
    __m128i *const out = (__m128i *)to + 352;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
in += 192;
to += 1536;
break;
}
case 0x15:
{
{
    /*
        Expand the first 16-byte vector of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in);
    __m128i *const out = (__m128i *)to;        // indexed below in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 1 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 1);
    __m128i *const out = (__m128i *)to + 32;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 2 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 2);
    __m128i *const out = (__m128i *)to + 64;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
    /*
        Expand the 16-byte vector at index 3 of the input run into 32 output vectors.
        Output vector k receives bit k of every 32-bit lane, so each stored integer is 0 or 1.
        The 64-bit shift acts like a per-32-bit-lane shift here: the count never exceeds 31
        and mask_1 keeps only bit 0 of each 32-bit lane.
    */
    const __m128i bits = _mm_loadu_si128((const __m128i *)in + 3);
    __m128i *const out = (__m128i *)to + 96;        // counted in 16-byte vectors, not in uint32_t

    _mm_storeu_si128(out, _mm_and_si128(bits, mask_1));
    _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
    _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
    _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
    _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
    _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
    _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
    _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
    _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
    _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
    _mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
    _mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
    _mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
    _mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
    _mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
    _mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
    _mm_storeu_si128(out + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
    _mm_storeu_si128(out + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
    _mm_storeu_si128(out + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
    _mm_storeu_si128(out + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
    _mm_storeu_si128(out + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
    _mm_storeu_si128(out + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
    _mm_storeu_si128(out + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
    _mm_storeu_si128(out + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
    _mm_storeu_si128(out + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
    _mm_storeu_si128(out + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
    _mm_storeu_si128(out + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
    _mm_storeu_si128(out + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
    _mm_storeu_si128(out + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
    _mm_storeu_si128(out + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
    _mm_storeu_si128(out + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
    _mm_storeu_si128(out + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
}
{
// Expands 16 input bytes (in[64] .. in[79]) into 128 uint32_t values of 0 or 1,
// written to to[512] .. to[639]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[80] .. in[95]) into 128 uint32_t values of 0 or 1,
// written to to[640] .. to[767]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[96] .. in[111]) into 128 uint32_t values of 0 or 1,
// written to to[768] .. to[895]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[112] .. in[127]) into 128 uint32_t values of 0 or 1,
// written to to[896] .. to[1023]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[128] .. in[143]) into 128 uint32_t values of 0 or 1,
// written to to[1024] .. to[1151]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[144] .. in[159]) into 128 uint32_t values of 0 or 1,
// written to to[1152] .. to[1279]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[160] .. in[175]) into 128 uint32_t values of 0 or 1,
// written to to[1280] .. to[1407]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
// Step both cursors past this case's data before returning to the key loop.
// in is a uint8_t *, so 176 is a byte count: 11 words of 16 bytes each.
// to is a uint32_t *, so 1408 is an element count: 11 x 128 values, which matches
// the last store above at (__m128i *)to + 320 + 31 (elements 1404 .. 1407).
in += 176;
to += 1408;
break;
}
case 0x16:
{
{
// Expands 16 input bytes (in[0] .. in[15]) into 128 uint32_t values of 0 or 1,
// written to to[0] .. to[127]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[16] .. in[31]) into 128 uint32_t values of 0 or 1,
// written to to[128] .. to[255]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Expands 16 input bytes (in[32] .. in[47]) into 128 uint32_t values of 0 or 1,
// written to to[256] .. to[383]. Both pointer offsets are in 16-byte units because
// the cast to __m128i * is applied before the addition.
// Store k (k = 0..31) keeps bit k of each 32-bit input lane: mask_1 holds 0x01 in
// every 32-bit lane (loaded from static_mask_1 at function entry), and for k <= 31
// the 64-bit shift moves bit k of a lane into bit 0 of that same lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 48 of `in` (vector index 3) into single-bit values.
// For j = 0..31: the 64-bit-lane right shift by j (j < 32) moves bit j of each 32-bit lane
// into bit 0 of that lane, and mask_1 (1 in every 32-bit lane) clears all other bits, so each
// stored vector holds four uint32_t values that are 0 or 1.
// Vector j lands in 16-byte slot 96 + j of `to`, i.e. this group fills to[384 .. 511].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 64 of `in` (vector index 4) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 128 + j of `to`, i.e. this group fills to[512 .. 639].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 80 of `in` (vector index 5) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 160 + j of `to`, i.e. this group fills to[640 .. 767].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 96 of `in` (vector index 6) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 192 + j of `to`, i.e. this group fills to[768 .. 895].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 112 of `in` (vector index 7) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 224 + j of `to`, i.e. this group fills to[896 .. 1023].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 128 of `in` (vector index 8) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 256 + j of `to`, i.e. this group fills to[1024 .. 1151].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 144 of `in` (vector index 9) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 288 + j of `to`, i.e. this group fills to[1152 .. 1279].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
// Advance both cursors past what this case handled, then leave the switch.
// `in` is a uint8_t pointer, so += 160 skips 160 bytes; the highest load above reads the
// 16-byte vector at (__m128i *)in + 9, which ends at byte 160.
// `to` is a uint32_t pointer, so += 1280 skips 1280 integers; the highest store above writes
// 16-byte slot 288 + 31 = 319, and 320 slots * 4 integers = 1280.
in += 160;
to += 1280;
break;
}
case 0x17:
{
{
// Unpack the 16-byte vector at byte offset 0 of `in` (vector index 0) into single-bit values.
// For j = 0..31: the 64-bit-lane right shift by j (j < 32) moves bit j of each 32-bit lane
// into bit 0 of that lane, and mask_1 (1 in every 32-bit lane) clears all other bits, so each
// stored vector holds four uint32_t values that are 0 or 1.
// Vector j lands in 16-byte slot 0 + j of `to`, i.e. this group fills to[0 .. 127].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 16 of `in` (vector index 1) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 32 + j of `to`, i.e. this group fills to[128 .. 255].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack the 16-byte vector at byte offset 32 of `in` (vector index 2) into single-bit values.
// For j = 0..31: shift right by j within 64-bit lanes (valid for 32-bit lanes because j < 32),
// then AND with mask_1 so only bit 0 of every 32-bit lane survives (each result is 0 or 1).
// Vector j lands in 16-byte slot 64 + j of `to`, i.e. this group fills to[256 .. 383].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
in += 144;
to += 1152;
break;
}
case 0x18:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
// Unpack input word 4 (the 16 bytes at (__m128i *)in + 4), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[512..639] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 4);
__m128i *const dest = (__m128i *)to + 128;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 5 (the 16 bytes at (__m128i *)in + 5), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[640..767] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 5);
__m128i *const dest = (__m128i *)to + 160;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 6 (the 16 bytes at (__m128i *)in + 6), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[768..895] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 6);
__m128i *const dest = (__m128i *)to + 192;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 7 (the 16 bytes at (__m128i *)in + 7), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[896..1023] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 7);
__m128i *const dest = (__m128i *)to + 224;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
in += 128;
to += 1024;
break;
}
case 0x19:
{
{
// Unpack input word 0 (the first 16 bytes at in), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[0..127] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in);
__m128i *const dest = (__m128i *)to;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 1 (the 16 bytes at (__m128i *)in + 1), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[128..255] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 1);
__m128i *const dest = (__m128i *)to + 32;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 2 (the 16 bytes at (__m128i *)in + 2), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[256..383] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 2);
__m128i *const dest = (__m128i *)to + 64;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 3 (the 16 bytes at (__m128i *)in + 3), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[384..511] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 3);
__m128i *const dest = (__m128i *)to + 96;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 4 (the 16 bytes at (__m128i *)in + 4), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[512..639] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 4);
__m128i *const dest = (__m128i *)to + 128;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
// Unpack input word 5 (the 16 bytes at (__m128i *)in + 5), which holds 1-bit values.
// Store k writes bit k of each 32-bit lane, so the 32 stores fill to[640..767] with 0/1.
// The shift is 64 bits wide, but for counts below 32 bit 0 of every 32-bit lane still
// comes from that same lane, and mask_1 (0x01 per lane) drops every other bit.
// Left fully unrolled with literal shift counts.
const __m128i packed_bits = _mm_loadu_si128((__m128i *)in + 5);
__m128i *const dest = (__m128i *)to + 160;
_mm_storeu_si128(dest, _mm_and_si128(packed_bits, mask_1));
_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
_mm_storeu_si128(dest + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
_mm_storeu_si128(dest + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
_mm_storeu_si128(dest + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
_mm_storeu_si128(dest + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
_mm_storeu_si128(dest + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
_mm_storeu_si128(dest + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
_mm_storeu_si128(dest + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
_mm_storeu_si128(dest + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
_mm_storeu_si128(dest + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
_mm_storeu_si128(dest + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
_mm_storeu_si128(dest + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
_mm_storeu_si128(dest + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
_mm_storeu_si128(dest + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
_mm_storeu_si128(dest + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
_mm_storeu_si128(dest + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
_mm_storeu_si128(dest + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
_mm_storeu_si128(dest + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
_mm_storeu_si128(dest + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
_mm_storeu_si128(dest + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
_mm_storeu_si128(dest + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
_mm_storeu_si128(dest + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
_mm_storeu_si128(dest + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
_mm_storeu_si128(dest + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
_mm_storeu_si128(dest + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
_mm_storeu_si128(dest + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
_mm_storeu_si128(dest + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
_mm_storeu_si128(dest + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
in += 112;
to += 896;
break;
}
case 0x1a:
{
	/*
		Selector 0x1a: unpack six consecutive 16-byte words of 1-bit integers.

		Every 16-byte word read from `in` is expanded into 32 stores of four
		uint32_t each (128 integers per word).  Store number `bit` holds bit
		`bit` of each of the four 32-bit lanes of the word.

		The shift is done on 64-bit lanes, but because the count never exceeds
		31 and only bit 0 of each 32-bit lane survives the AND with mask_1,
		the surviving bit always comes from the same 32-bit lane it is
		written to.

		Both loop bounds are compile-time constants.  The word is loaded
		inside the outer loop, immediately before its 32 stores, so the
		load/store order is the same as in the fully unrolled form.
	*/
	for (int word = 0; word < 6; word++)
	{
		const __m128i payload = _mm_loadu_si128((__m128i *)in + word);
		__m128i *const out = (__m128i *)to + word * 32;

		for (int bit = 0; bit < 32; bit++)
			_mm_storeu_si128(out + bit, _mm_and_si128(_mm_srli_epi64(payload, bit), mask_1));
	}

	in += 96;		// 6 words * 16 bytes consumed
	to += 768;		// 6 words * 128 integers produced
	break;
}
case 0x1b:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
in += 80;
to += 640;
break;
}
case 0x1c:
	{
	// Selector 0x1c: unpack four 128-bit input words of 1-bit integers.
	// Each input word expands to 32 output vectors: output vector n holds,
	// in each of its four 32-bit lanes, bit n of the matching 32-bit lane of
	// the input word.  In total 64 input bytes become 512 uint32_t values.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 4; word++, unpacked += 32)
		{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift works on 64-bit halves, but every count is below 32 and
		// only bit 0 of each 32-bit lane survives the mask, so nothing from a
		// neighbouring lane can leak into the result.  The stores are kept
		// unrolled so that each shift count stays a compile-time constant.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_1));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
		_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
		_mm_storeu_si128(unpacked + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
		_mm_storeu_si128(unpacked + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
		_mm_storeu_si128(unpacked + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
		_mm_storeu_si128(unpacked + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
		_mm_storeu_si128(unpacked + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
		_mm_storeu_si128(unpacked + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
		_mm_storeu_si128(unpacked + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
		_mm_storeu_si128(unpacked + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
		_mm_storeu_si128(unpacked + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
		_mm_storeu_si128(unpacked + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
		_mm_storeu_si128(unpacked + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
		_mm_storeu_si128(unpacked + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
		_mm_storeu_si128(unpacked + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
		_mm_storeu_si128(unpacked + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
		_mm_storeu_si128(unpacked + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
		_mm_storeu_si128(unpacked + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
		_mm_storeu_si128(unpacked + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
		_mm_storeu_si128(unpacked + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
		_mm_storeu_si128(unpacked + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
		_mm_storeu_si128(unpacked + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
		_mm_storeu_si128(unpacked + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
		_mm_storeu_si128(unpacked + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
		_mm_storeu_si128(unpacked + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
		}

	in += 64;		// 4 input words * 16 bytes
	to += 512;		// 4 input words * 32 vectors * 4 integers
	break;
	}
case 0x1d:
	{
	// Selector 0x1d: unpack three 128-bit input words of 1-bit integers.
	// Each input word expands to 32 output vectors: output vector n holds,
	// in each of its four 32-bit lanes, bit n of the matching 32-bit lane of
	// the input word.  In total 48 input bytes become 384 uint32_t values.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 3; word++, unpacked += 32)
		{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift works on 64-bit halves, but every count is below 32 and
		// only bit 0 of each 32-bit lane survives the mask, so nothing from a
		// neighbouring lane can leak into the result.  The stores are kept
		// unrolled so that each shift count stays a compile-time constant.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_1));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 1), mask_1));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_1));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_1));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_1));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 5), mask_1));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_1));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 7), mask_1));
		_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_1));
		_mm_storeu_si128(unpacked + 9, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_1));
		_mm_storeu_si128(unpacked + 10, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_1));
		_mm_storeu_si128(unpacked + 11, _mm_and_si128(_mm_srli_epi64(bits, 11), mask_1));
		_mm_storeu_si128(unpacked + 12, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_1));
		_mm_storeu_si128(unpacked + 13, _mm_and_si128(_mm_srli_epi64(bits, 13), mask_1));
		_mm_storeu_si128(unpacked + 14, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_1));
		_mm_storeu_si128(unpacked + 15, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_1));
		_mm_storeu_si128(unpacked + 16, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_1));
		_mm_storeu_si128(unpacked + 17, _mm_and_si128(_mm_srli_epi64(bits, 17), mask_1));
		_mm_storeu_si128(unpacked + 18, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_1));
		_mm_storeu_si128(unpacked + 19, _mm_and_si128(_mm_srli_epi64(bits, 19), mask_1));
		_mm_storeu_si128(unpacked + 20, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_1));
		_mm_storeu_si128(unpacked + 21, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_1));
		_mm_storeu_si128(unpacked + 22, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_1));
		_mm_storeu_si128(unpacked + 23, _mm_and_si128(_mm_srli_epi64(bits, 23), mask_1));
		_mm_storeu_si128(unpacked + 24, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_1));
		_mm_storeu_si128(unpacked + 25, _mm_and_si128(_mm_srli_epi64(bits, 25), mask_1));
		_mm_storeu_si128(unpacked + 26, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_1));
		_mm_storeu_si128(unpacked + 27, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_1));
		_mm_storeu_si128(unpacked + 28, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_1));
		_mm_storeu_si128(unpacked + 29, _mm_and_si128(_mm_srli_epi64(bits, 29), mask_1));
		_mm_storeu_si128(unpacked + 30, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_1));
		_mm_storeu_si128(unpacked + 31, _mm_and_si128(_mm_srli_epi64(bits, 31), mask_1));
		}

	in += 48;		// 3 input words * 16 bytes
	to += 384;		// 3 input words * 32 vectors * 4 integers
	break;
	}
case 0x1e:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
}
in += 32;
to += 256;
break;
}
case 0x1f:
	{
	/*
		Selector 0x1f: unpack one 16-byte vector of 1-bit fields.
		The vector is treated as four 32-bit words.  Store number k writes
		bit k of each of those four words (as a 0/1 uint32_t), so 32 stores
		produce 4 * 32 = 128 integers.

		The shift is done on 64-bit lanes, so bits from the upper word of a
		lane slide into the lower word; those strays always land above the
		field being extracted and are cleared by the AND with mask_1.
	*/
	const __m128i packed_bits = _mm_loadu_si128((const __m128i *)in);
	__m128i *unpacked = (__m128i *)to;

	_mm_storeu_si128(unpacked + 0, _mm_and_si128(packed_bits, mask_1));
	_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(packed_bits, 1), mask_1));
	_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(packed_bits, 2), mask_1));
	_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(packed_bits, 3), mask_1));
	_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(packed_bits, 4), mask_1));
	_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(packed_bits, 5), mask_1));
	_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(packed_bits, 6), mask_1));
	_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(packed_bits, 7), mask_1));
	_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi64(packed_bits, 8), mask_1));
	_mm_storeu_si128(unpacked + 9, _mm_and_si128(_mm_srli_epi64(packed_bits, 9), mask_1));
	_mm_storeu_si128(unpacked + 10, _mm_and_si128(_mm_srli_epi64(packed_bits, 10), mask_1));
	_mm_storeu_si128(unpacked + 11, _mm_and_si128(_mm_srli_epi64(packed_bits, 11), mask_1));
	_mm_storeu_si128(unpacked + 12, _mm_and_si128(_mm_srli_epi64(packed_bits, 12), mask_1));
	_mm_storeu_si128(unpacked + 13, _mm_and_si128(_mm_srli_epi64(packed_bits, 13), mask_1));
	_mm_storeu_si128(unpacked + 14, _mm_and_si128(_mm_srli_epi64(packed_bits, 14), mask_1));
	_mm_storeu_si128(unpacked + 15, _mm_and_si128(_mm_srli_epi64(packed_bits, 15), mask_1));
	_mm_storeu_si128(unpacked + 16, _mm_and_si128(_mm_srli_epi64(packed_bits, 16), mask_1));
	_mm_storeu_si128(unpacked + 17, _mm_and_si128(_mm_srli_epi64(packed_bits, 17), mask_1));
	_mm_storeu_si128(unpacked + 18, _mm_and_si128(_mm_srli_epi64(packed_bits, 18), mask_1));
	_mm_storeu_si128(unpacked + 19, _mm_and_si128(_mm_srli_epi64(packed_bits, 19), mask_1));
	_mm_storeu_si128(unpacked + 20, _mm_and_si128(_mm_srli_epi64(packed_bits, 20), mask_1));
	_mm_storeu_si128(unpacked + 21, _mm_and_si128(_mm_srli_epi64(packed_bits, 21), mask_1));
	_mm_storeu_si128(unpacked + 22, _mm_and_si128(_mm_srli_epi64(packed_bits, 22), mask_1));
	_mm_storeu_si128(unpacked + 23, _mm_and_si128(_mm_srli_epi64(packed_bits, 23), mask_1));
	_mm_storeu_si128(unpacked + 24, _mm_and_si128(_mm_srli_epi64(packed_bits, 24), mask_1));
	_mm_storeu_si128(unpacked + 25, _mm_and_si128(_mm_srli_epi64(packed_bits, 25), mask_1));
	_mm_storeu_si128(unpacked + 26, _mm_and_si128(_mm_srli_epi64(packed_bits, 26), mask_1));
	_mm_storeu_si128(unpacked + 27, _mm_and_si128(_mm_srli_epi64(packed_bits, 27), mask_1));
	_mm_storeu_si128(unpacked + 28, _mm_and_si128(_mm_srli_epi64(packed_bits, 28), mask_1));
	_mm_storeu_si128(unpacked + 29, _mm_and_si128(_mm_srli_epi64(packed_bits, 29), mask_1));
	_mm_storeu_si128(unpacked + 30, _mm_and_si128(_mm_srli_epi64(packed_bits, 30), mask_1));
	_mm_storeu_si128(unpacked + 31, _mm_and_si128(_mm_srli_epi64(packed_bits, 31), mask_1));

	in += 16;		// consumed one 16-byte payload vector
	to += 128;		// produced 128 decoded integers
	break;
	}
case 0x20:
	{
	/*
		Selector 0x20: unpack sixteen consecutive 16-byte vectors of 2-bit
		fields (256 payload bytes in total).  Each vector is treated as four
		32-bit words holding sixteen 2-bit fields apiece; store number j
		writes field j of each of the four words, so every vector expands to
		16 stores (64 integers) and the whole case emits 1024 integers.

		The vectors are processed strictly in order: load, sixteen stores,
		then the next load.  The shift counts stay compile-time immediates.

		The shift is done on 64-bit lanes, so bits from the upper word of a
		lane slide into the lower word; those strays always land above the
		field being extracted and are cleared by the AND with mask_2.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int vector = 0; vector < 16; vector++)
		{
		const __m128i fields = _mm_loadu_si128(packed + vector);

		_mm_storeu_si128(unpacked + 0, _mm_and_si128(fields, mask_2));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(fields, 2), mask_2));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(fields, 4), mask_2));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(fields, 6), mask_2));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(fields, 8), mask_2));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(fields, 10), mask_2));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(fields, 12), mask_2));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(fields, 14), mask_2));
		_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi64(fields, 16), mask_2));
		_mm_storeu_si128(unpacked + 9, _mm_and_si128(_mm_srli_epi64(fields, 18), mask_2));
		_mm_storeu_si128(unpacked + 10, _mm_and_si128(_mm_srli_epi64(fields, 20), mask_2));
		_mm_storeu_si128(unpacked + 11, _mm_and_si128(_mm_srli_epi64(fields, 22), mask_2));
		_mm_storeu_si128(unpacked + 12, _mm_and_si128(_mm_srli_epi64(fields, 24), mask_2));
		_mm_storeu_si128(unpacked + 13, _mm_and_si128(_mm_srli_epi64(fields, 26), mask_2));
		_mm_storeu_si128(unpacked + 14, _mm_and_si128(_mm_srli_epi64(fields, 28), mask_2));
		_mm_storeu_si128(unpacked + 15, _mm_and_si128(_mm_srli_epi64(fields, 30), mask_2));

		unpacked += 16;		// next vector's output starts 16 stores further on
		}

	in += 256;		// consumed sixteen 16-byte payload vectors
	to += 1024;		// produced 1024 decoded integers
	break;
	}
// Selector 0x21: unpack 2-bit fields.
// Reads 15 consecutive 16-byte vectors from `in` and, for each one, writes 16
// output vectors to `to` (15 * 16 = 240 vectors = 960 uint32_t values).
// Output vector j of a group is the input vector shifted right by 2*j bits and
// ANDed with mask_2 (0x03 in every 32-bit lane), so each 32-bit input lane
// yields its sixteen 2-bit fields, lowest bits first, one per output vector.
// _mm_srli_epi64 shifts across each 64-bit half, but because the largest shift
// is 30 and mask_2 keeps only bits 0-1 of every 32-bit lane, the bits that
// cross from the upper into the lower 32-bit lane are always masked away.
// NOTE(review): no check against destination_integers is made in this case
// body; all 960 values are stored unconditionally - confirm that callers
// over-allocate the output buffer accordingly.
// NOTE(review): presumably the high nibble of the selector is the bit-width
// class and the low nibble the run length - verify against the encoder.
case 0x21:
{
{
// Input vector 0 -> output vectors 0..15 (unaligned load/stores throughout).
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Last of the 15 input vectors for this selector -> output vectors 224..239.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past what was consumed/produced: `in` is a byte pointer
// (15 vectors * 16 bytes = 240), `to` is a uint32_t pointer
// (240 vectors * 4 lanes = 960).
in += 240;
to += 960;
break;
}
case 0x22:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 5: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 80 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 6: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 96 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 7: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 112 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 8: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 128 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 9: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 144 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 10: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 160 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 11: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 176 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 12: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 192 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 13: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 208 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
in += 224;
to += 896;
break;
}
case 0x23:
{
{
// Unpack the 128-bit word at (__m128i *)in + 0: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 0 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 1: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 16 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 2: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 32 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 3: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 48 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 4: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 64 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 5: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 80 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 6: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 96 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 7: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 112 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 8: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 128 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 9: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 144 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 10: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 160 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Unpack the 128-bit word at (__m128i *)in + 11: every 32-bit lane holds sixteen 2-bit values.
// Shifting by 2*k (k = 0..15) and ANDing with mask_2 (0x03 per lane) produces output vector (__m128i *)to + 176 + k.
// _mm_srli_epi64 shifts 64-bit halves, but with shifts <= 30 the two bits kept per 32-bit lane never come from the neighbouring lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
in += 208;
to += 832;
break;
}
case 0x24:
{
// Key byte 0x24: unpack 12 consecutive 16-byte vectors of 2-bit integers.
// Each input vector is expanded into 16 output vectors of four uint32_t
// (one per shift 0, 2, 4, ..., 30), so this case reads 192 bytes from `in`
// and writes 768 integers to `to`.
// NOTE(review): presumably the key byte encodes both the bit width and the
// run length (the preceding case handles 13 vectors) -- confirm against the
// encoder.
//
// `(__m128i *)to + k` and `(__m128i *)in + k` index in 16-byte units.
// _mm_srli_epi64 shifts whole 64-bit lanes, so bits of the upper 32-bit word
// slide into the top of the lower word; the AND with mask_2 (0x03 in every
// 32-bit lane) keeps only the low 2 bits of each lane, and no shift exceeds
// 30, so the bits that cross the word boundary are always masked away.
// Input vector 0 -> output vectors 0..15.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 1 -> output vectors 16..31.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 2 -> output vectors 32..47.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 3 -> output vectors 48..63.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 4 -> output vectors 64..79.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 5 -> output vectors 80..95.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 6 -> output vectors 96..111.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 7 -> output vectors 112..127.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 8 -> output vectors 128..143.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 9 -> output vectors 144..159.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 10 -> output vectors 160..175.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Input vector 11 -> output vectors 176..191.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past what was consumed/produced: 12 loads x 16 bytes = 192 input
// bytes (`in` is a byte pointer); 12 x 16 stores x 4 lanes = 768 output
// integers (`to` is a uint32_t pointer).
in += 192;
to += 768;
break;
}
case 0x25:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
in += 176;
to += 704;
break;
}
case 0x26:
{
	/*
		Selector 0x26: expand ten consecutive 16-byte words of 2-bit fields.

		Each word is read once from the payload pointer (in) and then emitted
		sixteen times: unshifted, then shifted right by 2, 4, ... 30 bits.
		Every result is ANDed with mask_2 (0x03 in each 32-bit lane), so each
		store writes four 32-bit integers.  That is 64 integers per word and
		640 for the whole case, which is why to (a uint32_t pointer) advances
		by 640 while in (a byte pointer) advances by 10 * 16 = 160.

		The shift works on 64-bit lanes, but because the count never exceeds
		30 the two bits kept for each 32-bit lane always originate in that
		same lane; anything pulled down from the upper half of the 64-bit
		lane is removed by the mask.

		Loads and stores are issued in exactly this order: load word w, write
		its sixteen output vectors, then move on to word w + 1.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 10; ++word)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *out = unpacked + word * 16;		// sixteen output vectors per input word

		// Shift counts are kept as literal constants, one statement per 2-bit slot.
		_mm_storeu_si128(out, _mm_and_si128(bits, mask_2));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 2), mask_2));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_2));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_2));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_2));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 10), mask_2));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_2));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 14), mask_2));
		_mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_2));
		_mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_2));
		_mm_storeu_si128(out + 10, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_2));
		_mm_storeu_si128(out + 11, _mm_and_si128(_mm_srli_epi64(bits, 22), mask_2));
		_mm_storeu_si128(out + 12, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_2));
		_mm_storeu_si128(out + 13, _mm_and_si128(_mm_srli_epi64(bits, 26), mask_2));
		_mm_storeu_si128(out + 14, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_2));
		_mm_storeu_si128(out + 15, _mm_and_si128(_mm_srli_epi64(bits, 30), mask_2));
	}

	in += 160;		// bytes consumed: 10 words * 16 bytes
	to += 640;		// integers produced: 10 words * 64 integers
	break;
}
case 0x27:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
in += 144;
to += 576;
break;
}
case 0x28:
// Selector byte 0x28: unpack 2-bit integers from 8 consecutive 16-byte input vectors.
//
// Each input vector is treated as four 32-bit words. Output vector j (j = 0..15) produced
// from an input vector holds, in lane i, bits [2j, 2j+1] of input word i, i.e.
// (word_i >> 2j) & 0x3 (mask_2 is loaded from static_mask_2 = {0x03, 0x03, 0x03, 0x03}).
// Input vector v therefore fills output vectors 16v .. 16v+15.
//
// The shift is done on 64-bit lanes (_mm_srli_epi64), so bits of the odd words slide into
// the even words at bit position (32 - shift) and above. The largest shift is 30, so those
// bits never reach positions 0..1 and are removed by the AND with mask_2.
//
// Totals: 8 * 16 = 128 bytes read, 8 * 16 * 4 = 512 uint32_t written.
// NOTE(review): how the selector value maps to (bit width, vector count) is decided by the
// encoder, which is not visible here - confirm against it before changing any constant.
{
{
// Input vector 0 -> output vectors 0..15.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 2 -> output vectors 32..47.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 3 -> output vectors 48..63.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 4 -> output vectors 64..79.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 5 -> output vectors 80..95.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 6 -> output vectors 96..111.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 7 -> output vectors 112..127.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past what was consumed/produced: in is a byte pointer, to is a uint32_t pointer.
in += 128; // 8 input vectors * 16 bytes
to += 512; // 128 output vectors * 4 integers
break;
}
case 0x29:
// Selector byte 0x29: unpack 2-bit integers from 7 consecutive 16-byte input vectors.
//
// Each input vector is treated as four 32-bit words. Output vector j (j = 0..15) produced
// from an input vector holds, in lane i, bits [2j, 2j+1] of input word i, i.e.
// (word_i >> 2j) & 0x3 (mask_2 is loaded from static_mask_2 = {0x03, 0x03, 0x03, 0x03}).
// Input vector v therefore fills output vectors 16v .. 16v+15.
//
// The shift is done on 64-bit lanes (_mm_srli_epi64), so bits of the odd words slide into
// the even words at bit position (32 - shift) and above. The largest shift is 30, so those
// bits never reach positions 0..1 and are removed by the AND with mask_2.
//
// Totals: 7 * 16 = 112 bytes read, 7 * 16 * 4 = 448 uint32_t written.
// NOTE(review): how the selector value maps to (bit width, vector count) is decided by the
// encoder, which is not visible here - confirm against it before changing any constant.
{
{
// Input vector 0 -> output vectors 0..15.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 2 -> output vectors 32..47.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 3 -> output vectors 48..63.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 4 -> output vectors 64..79.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 5 -> output vectors 80..95.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input vector 6 -> output vectors 96..111.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past what was consumed/produced: in is a byte pointer, to is a uint32_t pointer.
in += 112; // 7 input vectors * 16 bytes
to += 448; // 112 output vectors * 4 integers
break;
}
// Key 0x2a: unpack six 16-byte input words of 2-bit fields.
// Each word produces 16 output vectors of four 32-bit integers (64 integers),
// so this case consumes 96 bytes and emits 384 integers.
// NOTE(review): the low nibble of the key appears to select the run length
// (16 - nibble words) -- confirm against the encoder.
case 0x2a:
{
{
// Input word 0 -> output vectors 0..15. The shift operates on 64-bit lanes, so bits
// from the upper 32-bit half slide into the lower half; mask_2 keeps only the low 2 bits of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 2 -> output vectors 32..47.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 3 -> output vectors 48..63.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 4 -> output vectors 64..79.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 5 -> output vectors 80..95.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past the 6 x 16 input bytes consumed and the 6 x 64 integers written.
in += 96;
to += 384;
break;
}
// Key 0x2b: unpack five 16-byte input words of 2-bit fields.
// Each word produces 16 output vectors of four 32-bit integers (64 integers),
// so this case consumes 80 bytes and emits 320 integers.
// NOTE(review): the low nibble of the key appears to select the run length
// (16 - nibble words) -- confirm against the encoder.
case 0x2b:
{
{
// Input word 0 -> output vectors 0..15. The shift operates on 64-bit lanes, so bits
// from the upper 32-bit half slide into the lower half; mask_2 keeps only the low 2 bits of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 2 -> output vectors 32..47.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 3 -> output vectors 48..63.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 4 -> output vectors 64..79.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past the 5 x 16 input bytes consumed and the 5 x 64 integers written.
in += 80;
to += 320;
break;
}
// Key 0x2c: unpack four 16-byte input words of 2-bit fields.
// Each word produces 16 output vectors of four 32-bit integers (64 integers),
// so this case consumes 64 bytes and emits 256 integers.
// NOTE(review): the low nibble of the key appears to select the run length
// (16 - nibble words) -- confirm against the encoder.
case 0x2c:
{
{
// Input word 0 -> output vectors 0..15. The shift operates on 64-bit lanes, so bits
// from the upper 32-bit half slide into the lower half; mask_2 keeps only the low 2 bits of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 2 -> output vectors 32..47.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 3 -> output vectors 48..63.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past the 4 x 16 input bytes consumed and the 4 x 64 integers written.
in += 64;
to += 256;
break;
}
// Key 0x2d: unpack three 16-byte input words of 2-bit fields.
// Each word produces 16 output vectors of four 32-bit integers (64 integers),
// so this case consumes 48 bytes and emits 192 integers.
// NOTE(review): the low nibble of the key appears to select the run length
// (16 - nibble words) -- confirm against the encoder.
case 0x2d:
{
{
// Input word 0 -> output vectors 0..15. The shift operates on 64-bit lanes, so bits
// from the upper 32-bit half slide into the lower half; mask_2 keeps only the low 2 bits of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 2 -> output vectors 32..47.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past the 3 x 16 input bytes consumed and the 3 x 64 integers written.
in += 48;
to += 192;
break;
}
// Key 0x2e: unpack two 16-byte input words of 2-bit fields.
// Each word produces 16 output vectors of four 32-bit integers (64 integers),
// so this case consumes 32 bytes and emits 128 integers.
// NOTE(review): the low nibble of the key appears to select the run length
// (16 - nibble words) -- confirm against the encoder.
case 0x2e:
{
{
// Input word 0 -> output vectors 0..15. The shift operates on 64-bit lanes, so bits
// from the upper 32-bit half slide into the lower half; mask_2 keeps only the low 2 bits of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
{
// Input word 1 -> output vectors 16..31.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past the 2 x 16 input bytes consumed and the 2 x 64 integers written.
in += 32;
to += 128;
break;
}
// Key 0x2f: unpack a single 16-byte input word of 2-bit fields.
// The word produces 16 output vectors of four 32-bit integers,
// so this case consumes 16 bytes and emits 64 integers.
// NOTE(review): the low nibble of the key appears to select the run length
// (16 - nibble words) -- confirm against the encoder.
case 0x2f:
{
{
// Input word 0 -> output vectors 0..15. The shift operates on 64-bit lanes, so bits
// from the upper 32-bit half slide into the lower half; mask_2 keeps only the low 2 bits of each 32-bit lane.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
}
// Advance past the 16 input bytes consumed and the 64 integers written.
in += 16;
to += 64;
break;
}
case 0x30:
{
    /*
        Selector 0x30: sixteen 16-byte words holding 3-bit integers.
        Each 32-bit lane carries 10 values (shifts 0, 3, ..., 27), so every
        word expands to 10 stores of 4 integers, and the sixteen words
        together produce 16 * 10 * 4 = 640 integers from 256 input bytes.

        The shift works on 64-bit lanes; mask_3 keeps only the low 3 bits of
        each 32-bit lane, and since shift + 3 <= 32 no bits from the
        neighbouring lane leak into the result.

        Words are processed in order, each one loaded immediately before its
        ten stores, exactly as in the fully unrolled form.
    */
    const __m128i *const packed = (const __m128i *)in;
    __m128i *const unpacked = (__m128i *)to;

    for (int word = 0; word < 16; word++)
    {
        const __m128i bits = _mm_loadu_si128(packed + word);
        __m128i *const out = unpacked + word * 10;

        _mm_storeu_si128(out + 0, _mm_and_si128(bits, mask_3));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_3));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_3));
        _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_3));
        _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_3));
        _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_3));
        _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_3));
        _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_3));
        _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_3));
        _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_3));
    }

    in += 256;      // 16 words * 16 bytes consumed
    to += 640;      // 16 words * 10 stores * 4 integers produced
    break;
}
case 0x31:
{
    /*
        Selector 0x31: fifteen 16-byte words holding 3-bit integers.
        Each 32-bit lane carries 10 values (shifts 0, 3, ..., 27), so every
        word expands to 10 stores of 4 integers, and the fifteen words
        together produce 15 * 10 * 4 = 600 integers from 240 input bytes.

        The shift works on 64-bit lanes; mask_3 keeps only the low 3 bits of
        each 32-bit lane, and since shift + 3 <= 32 no bits from the
        neighbouring lane leak into the result.

        Words are processed in order, each one loaded immediately before its
        ten stores, exactly as in the fully unrolled form.
    */
    const __m128i *const packed = (const __m128i *)in;
    __m128i *const unpacked = (__m128i *)to;

    for (int word = 0; word < 15; word++)
    {
        const __m128i bits = _mm_loadu_si128(packed + word);
        __m128i *const out = unpacked + word * 10;

        _mm_storeu_si128(out + 0, _mm_and_si128(bits, mask_3));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(bits, 3), mask_3));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_3));
        _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(bits, 9), mask_3));
        _mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_3));
        _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(bits, 15), mask_3));
        _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_3));
        _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(bits, 21), mask_3));
        _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_3));
        _mm_storeu_si128(out + 9, _mm_and_si128(_mm_srli_epi64(bits, 27), mask_3));
    }

    in += 240;      // 15 words * 16 bytes consumed
    to += 600;      // 15 words * 10 stores * 4 integers produced
    break;
}
case 0x32:
{
/*
	Selector 0x32: unpack 14 consecutive 128-bit words read from `in`.
	Every word is expanded into ten 128-bit output vectors: the word is
	shifted right by 0, 3, 6, ... 27 bits and ANDed with mask_3 (0x07 in
	each 32-bit lane), so each 32-bit lane yields ten 3-bit integers.

	The shift operates on 64-bit lanes, so bits from the upper 32-bit half
	slide into the lower half; with a maximum shift of 27 they land at bit 5
	or above and are discarded by the 3-bit mask.
*/
const __m128i *packed = (const __m128i *)in;
__m128i *unpacked = (__m128i *)to;

/* Both trip counts are compile-time constants. */
for (int word = 0; word < 14; word++)
	{
	const __m128i payload = _mm_loadu_si128(packed + word);

	for (int slot = 0; slot < 10; slot++)
		_mm_storeu_si128(unpacked + word * 10 + slot, _mm_and_si128(_mm_srli_epi64(payload, slot * 3), mask_3));
	}

in += 224;		/* 14 words * 16 bytes consumed */
to += 560;		/* 14 words * 10 vectors * 4 integers produced */
break;
}
case 0x33:
{
/*
	Selector 0x33: unpack 13 consecutive 128-bit words read from `in`.
	Every word is expanded into ten 128-bit output vectors: the word is
	shifted right by 0, 3, 6, ... 27 bits and ANDed with mask_3 (0x07 in
	each 32-bit lane), so each 32-bit lane yields ten 3-bit integers.

	The shift operates on 64-bit lanes, so bits from the upper 32-bit half
	slide into the lower half; with a maximum shift of 27 they land at bit 5
	or above and are discarded by the 3-bit mask.
*/
const __m128i *packed = (const __m128i *)in;
__m128i *unpacked = (__m128i *)to;

/* Both trip counts are compile-time constants. */
for (int word = 0; word < 13; word++)
	{
	const __m128i payload = _mm_loadu_si128(packed + word);

	for (int slot = 0; slot < 10; slot++)
		_mm_storeu_si128(unpacked + word * 10 + slot, _mm_and_si128(_mm_srli_epi64(payload, slot * 3), mask_3));
	}

in += 208;		/* 13 words * 16 bytes consumed */
to += 520;		/* 13 words * 10 vectors * 4 integers produced */
break;
}
case 0x34:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
in += 192;
to += 480;
break;
}
case 0x35:
{
// Selector key 0x35: unpack eleven 16-byte words of 3-bit fields.
// Each 128-bit word loaded from `in` is treated as four 32-bit lanes. Ten
// 3-bit fields are pulled from every lane (bit offsets 0, 3, ..., 27), so
// bits 30-31 of each lane are never read. Each word yields ten output
// vectors of four uint32_t: output vector k holds field k of lanes 0..3.
// All loads and stores use the unaligned forms, and nothing in this case
// checks how much input or output space remains.
// NOTE(review): the high nibble (3) presumably selects the 3-bit width and
// the low nibble the word count (11 here) -- confirm against the encoder.
{
// _mm_srli_epi64 shifts 64-bit halves, so bits of the odd lanes spill into
// the top of the even lanes; mask_3 (0x07 per 32-bit lane) keeps only the
// low three bits of each lane, which discards that spill-over.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
// Advance past what was consumed and produced: 11 words * 16 bytes = 176
// bytes of input (in is a uint8_t *), and 110 vectors * 4 lanes = 440
// decoded integers (to is a uint32_t *).
in += 176;
to += 440;
break;
}
case 0x36:
{
// Selector key 0x36: unpack ten 16-byte words of 3-bit fields.
// Each 128-bit word loaded from `in` is treated as four 32-bit lanes. Ten
// 3-bit fields are pulled from every lane (bit offsets 0, 3, ..., 27), so
// bits 30-31 of each lane are never read. Each word yields ten output
// vectors of four uint32_t: output vector k holds field k of lanes 0..3.
// All loads and stores use the unaligned forms, and nothing in this case
// checks how much input or output space remains.
// NOTE(review): the high nibble (3) presumably selects the 3-bit width and
// the low nibble the word count (10 here) -- confirm against the encoder.
{
// _mm_srli_epi64 shifts 64-bit halves, so bits of the odd lanes spill into
// the top of the even lanes; mask_3 (0x07 per 32-bit lane) keeps only the
// low three bits of each lane, which discards that spill-over.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
// Advance past what was consumed and produced: 10 words * 16 bytes = 160
// bytes of input (in is a uint8_t *), and 100 vectors * 4 lanes = 400
// decoded integers (to is a uint32_t *).
in += 160;
to += 400;
break;
}
case 0x37:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
in += 144;
to += 360;
break;
}
case 0x38:
// Selector 0x38: unpack eight consecutive 16-byte vectors read from `in`.
// Every vector is shifted right by 0, 3, 6, ..., 27 bits and ANDed with
// mask_3 (0x07 in each 32-bit lane), giving ten 3-bit fields per lane, i.e.
// 10 output vectors = 40 uint32_t values per input vector. Bits 30..31 of
// each input lane are never extracted.
// Totals for this case: 8 * 16 = 128 bytes consumed, 8 * 40 = 320 integers
// written, which is exactly what the pointer advances at the bottom reflect.
// The shift is the 64-bit-lane _mm_srli_epi64, but each extracted field lies
// in bits 0..29 of its 32-bit lane, so after masking nothing leaks in from the
// neighbouring lane. Loads and stores are the unaligned variants, and this
// case does no bounds checking on `in` or `to`.
{
{
// Input bytes 0..15 -> to[0..39].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 16..31 -> to[40..79].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 32..47 -> to[80..119].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 48..63 -> to[120..159].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 64..79 -> to[160..199].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 80..95 -> to[200..239].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 96..111 -> to[240..279].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 112..127 -> to[280..319].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
// `in` is a byte pointer: skip the 8 vectors (128 bytes) just decoded.
in += 128;
// `to` is a uint32_t pointer: skip the 80 output vectors (320 integers) just written.
to += 320;
break;
}
case 0x39:
// Selector 0x39: unpack seven consecutive 16-byte vectors read from `in`.
// Every vector is shifted right by 0, 3, 6, ..., 27 bits and ANDed with
// mask_3 (0x07 in each 32-bit lane), giving ten 3-bit fields per lane, i.e.
// 10 output vectors = 40 uint32_t values per input vector. Bits 30..31 of
// each input lane are never extracted.
// Totals for this case: 7 * 16 = 112 bytes consumed, 7 * 40 = 280 integers
// written, which is exactly what the pointer advances at the bottom reflect.
// The shift is the 64-bit-lane _mm_srli_epi64, but each extracted field lies
// in bits 0..29 of its 32-bit lane, so after masking nothing leaks in from the
// neighbouring lane. Loads and stores are the unaligned variants, and this
// case does no bounds checking on `in` or `to`.
{
{
// Input bytes 0..15 -> to[0..39].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 16..31 -> to[40..79].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 32..47 -> to[80..119].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 48..63 -> to[120..159].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 64..79 -> to[160..199].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 80..95 -> to[200..239].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 96..111 -> to[240..279].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
// `in` is a byte pointer: skip the 7 vectors (112 bytes) just decoded.
in += 112;
// `to` is a uint32_t pointer: skip the 70 output vectors (280 integers) just written.
to += 280;
break;
}
case 0x3a:
// Selector 0x3a: unpack six consecutive 16-byte vectors read from `in`.
// Every vector is shifted right by 0, 3, 6, ..., 27 bits and ANDed with
// mask_3 (0x07 in each 32-bit lane), giving ten 3-bit fields per lane, i.e.
// 10 output vectors = 40 uint32_t values per input vector. Bits 30..31 of
// each input lane are never extracted.
// Totals for this case: 6 * 16 = 96 bytes consumed, 6 * 40 = 240 integers
// written, which is exactly what the pointer advances at the bottom reflect.
// The shift is the 64-bit-lane _mm_srli_epi64, but each extracted field lies
// in bits 0..29 of its 32-bit lane, so after masking nothing leaks in from the
// neighbouring lane. Loads and stores are the unaligned variants, and this
// case does no bounds checking on `in` or `to`.
{
{
// Input bytes 0..15 -> to[0..39].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 16..31 -> to[40..79].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 32..47 -> to[80..119].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 48..63 -> to[120..159].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 64..79 -> to[160..199].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
// Input bytes 80..95 -> to[200..239].
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
// `in` is a byte pointer: skip the 6 vectors (96 bytes) just decoded.
in += 96;
// `to` is a uint32_t pointer: skip the 60 output vectors (240 integers) just written.
to += 240;
break;
}
case 0x3b:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
}
in += 80;
to += 200;
break;
}
case 0x3c:
	{
	// Key 0x3c: four 16-byte words of 3-bit integers.
	// Each 32-bit lane carries ten 3-bit fields (bits 0..29), so one 128-bit
	// load expands into ten 128-bit stores (40 integers); four loads give 160.
	// The shift is done on 64-bit lanes, which is harmless here: every field
	// ends at or below bit 30, so the masked bits always come from the lane's
	// own 32-bit word.
	const __m128i *src = (const __m128i *)in;
	__m128i *dst = (__m128i *)to;
	__m128i packed;

	// word 0 -> integers 0..39
	packed = _mm_loadu_si128(src + 0);
	_mm_storeu_si128(dst + 0, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 7, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 8, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 9, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	// word 1 -> integers 40..79
	packed = _mm_loadu_si128(src + 1);
	_mm_storeu_si128(dst + 10, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 11, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 12, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 13, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 14, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 15, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 16, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 17, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 18, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 19, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	// word 2 -> integers 80..119
	packed = _mm_loadu_si128(src + 2);
	_mm_storeu_si128(dst + 20, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 21, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 22, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 23, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 24, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 25, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 26, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 27, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 28, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 29, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	// word 3 -> integers 120..159
	packed = _mm_loadu_si128(src + 3);
	_mm_storeu_si128(dst + 30, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 31, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 32, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 33, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 34, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 35, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 36, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 37, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 38, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 39, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	in += 64;		// 4 words * 16 bytes consumed
	to += 160;		// 4 words * 40 integers produced
	break;
	}
case 0x3d:
	{
	// Key 0x3d: three 16-byte words of 3-bit integers.
	// Each 32-bit lane carries ten 3-bit fields (bits 0..29), so one 128-bit
	// load expands into ten 128-bit stores (40 integers); three loads give 120.
	// The 64-bit lane shift is safe because every field ends at or below
	// bit 30 of its own 32-bit word before the mask is applied.
	const __m128i *src = (const __m128i *)in;
	__m128i *dst = (__m128i *)to;
	__m128i packed;

	// word 0 -> integers 0..39
	packed = _mm_loadu_si128(src + 0);
	_mm_storeu_si128(dst + 0, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 7, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 8, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 9, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	// word 1 -> integers 40..79
	packed = _mm_loadu_si128(src + 1);
	_mm_storeu_si128(dst + 10, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 11, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 12, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 13, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 14, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 15, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 16, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 17, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 18, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 19, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	// word 2 -> integers 80..119
	packed = _mm_loadu_si128(src + 2);
	_mm_storeu_si128(dst + 20, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 21, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 22, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 23, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 24, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 25, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 26, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 27, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 28, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 29, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	in += 48;		// 3 words * 16 bytes consumed
	to += 120;		// 3 words * 40 integers produced
	break;
	}
case 0x3e:
	{
	// Key 0x3e: two 16-byte words of 3-bit integers.
	// Each 32-bit lane carries ten 3-bit fields (bits 0..29), so one 128-bit
	// load expands into ten 128-bit stores (40 integers); two loads give 80.
	// The 64-bit lane shift is safe because every field ends at or below
	// bit 30 of its own 32-bit word before the mask is applied.
	const __m128i *src = (const __m128i *)in;
	__m128i *dst = (__m128i *)to;
	__m128i packed;

	// word 0 -> integers 0..39
	packed = _mm_loadu_si128(src + 0);
	_mm_storeu_si128(dst + 0, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 7, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 8, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 9, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	// word 1 -> integers 40..79
	packed = _mm_loadu_si128(src + 1);
	_mm_storeu_si128(dst + 10, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 11, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 12, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 13, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 14, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 15, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 16, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 17, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 18, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 19, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	in += 32;		// 2 words * 16 bytes consumed
	to += 80;		// 2 words * 40 integers produced
	break;
	}
case 0x3f:
	{
	// Key 0x3f: a single 16-byte word of 3-bit integers.
	// Each 32-bit lane carries ten 3-bit fields (bits 0..29), so the one
	// 128-bit load expands into ten 128-bit stores, i.e. 40 integers.
	// The 64-bit lane shift is safe because every field ends at or below
	// bit 30 of its own 32-bit word before the mask is applied.
	__m128i *dst = (__m128i *)to;
	const __m128i packed = _mm_loadu_si128((const __m128i *)in);

	_mm_storeu_si128(dst + 0, _mm_and_si128(packed, mask_3));
	_mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi64(packed, 3), mask_3));
	_mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi64(packed, 6), mask_3));
	_mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi64(packed, 9), mask_3));
	_mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_3));
	_mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_3));
	_mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi64(packed, 18), mask_3));
	_mm_storeu_si128(dst + 7, _mm_and_si128(_mm_srli_epi64(packed, 21), mask_3));
	_mm_storeu_si128(dst + 8, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_3));
	_mm_storeu_si128(dst + 9, _mm_and_si128(_mm_srli_epi64(packed, 27), mask_3));

	in += 16;		// 1 word * 16 bytes consumed
	to += 40;		// 1 word * 40 integers produced
	break;
	}
case 0x40:
	{
	// Key 0x40: sixteen 16-byte words of 4-bit integers.
	// Each 32-bit lane carries eight 4-bit fields (bits 0..31), so one 128-bit
	// load expands into eight 128-bit stores (32 integers); sixteen loads
	// give 512 integers from 256 input bytes.
	// The shift is done on 64-bit lanes, which is harmless here: the highest
	// field occupies bits 28..31, so the masked bits always come from the
	// lane's own 32-bit word.
	const __m128i *src = (const __m128i *)in;
	__m128i *dst = (__m128i *)to;
	__m128i packed;

	// word 0 -> integers 0..31
	packed = _mm_loadu_si128(src + 0);
	_mm_storeu_si128(dst + 0, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 7, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 1 -> integers 32..63
	packed = _mm_loadu_si128(src + 1);
	_mm_storeu_si128(dst + 8, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 9, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 10, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 11, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 12, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 13, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 14, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 15, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 2 -> integers 64..95
	packed = _mm_loadu_si128(src + 2);
	_mm_storeu_si128(dst + 16, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 17, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 18, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 19, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 20, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 21, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 22, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 23, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 3 -> integers 96..127
	packed = _mm_loadu_si128(src + 3);
	_mm_storeu_si128(dst + 24, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 25, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 26, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 27, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 28, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 29, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 30, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 31, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 4 -> integers 128..159
	packed = _mm_loadu_si128(src + 4);
	_mm_storeu_si128(dst + 32, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 33, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 34, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 35, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 36, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 37, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 38, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 39, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 5 -> integers 160..191
	packed = _mm_loadu_si128(src + 5);
	_mm_storeu_si128(dst + 40, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 41, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 42, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 43, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 44, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 45, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 46, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 47, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 6 -> integers 192..223
	packed = _mm_loadu_si128(src + 6);
	_mm_storeu_si128(dst + 48, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 49, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 50, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 51, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 52, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 53, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 54, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 55, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 7 -> integers 224..255
	packed = _mm_loadu_si128(src + 7);
	_mm_storeu_si128(dst + 56, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 57, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 58, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 59, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 60, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 61, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 62, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 63, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 8 -> integers 256..287
	packed = _mm_loadu_si128(src + 8);
	_mm_storeu_si128(dst + 64, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 65, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 66, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 67, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 68, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 69, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 70, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 71, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 9 -> integers 288..319
	packed = _mm_loadu_si128(src + 9);
	_mm_storeu_si128(dst + 72, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 73, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 74, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 75, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 76, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 77, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 78, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 79, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 10 -> integers 320..351
	packed = _mm_loadu_si128(src + 10);
	_mm_storeu_si128(dst + 80, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 81, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 82, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 83, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 84, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 85, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 86, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 87, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 11 -> integers 352..383
	packed = _mm_loadu_si128(src + 11);
	_mm_storeu_si128(dst + 88, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 89, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 90, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 91, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 92, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 93, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 94, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 95, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 12 -> integers 384..415
	packed = _mm_loadu_si128(src + 12);
	_mm_storeu_si128(dst + 96, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 97, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 98, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 99, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 100, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 101, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 102, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 103, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 13 -> integers 416..447
	packed = _mm_loadu_si128(src + 13);
	_mm_storeu_si128(dst + 104, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 105, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 106, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 107, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 108, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 109, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 110, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 111, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 14 -> integers 448..479
	packed = _mm_loadu_si128(src + 14);
	_mm_storeu_si128(dst + 112, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 113, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 114, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 115, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 116, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 117, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 118, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 119, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	// word 15 -> integers 480..511
	packed = _mm_loadu_si128(src + 15);
	_mm_storeu_si128(dst + 120, _mm_and_si128(packed, mask_4));
	_mm_storeu_si128(dst + 121, _mm_and_si128(_mm_srli_epi64(packed, 4), mask_4));
	_mm_storeu_si128(dst + 122, _mm_and_si128(_mm_srli_epi64(packed, 8), mask_4));
	_mm_storeu_si128(dst + 123, _mm_and_si128(_mm_srli_epi64(packed, 12), mask_4));
	_mm_storeu_si128(dst + 124, _mm_and_si128(_mm_srli_epi64(packed, 16), mask_4));
	_mm_storeu_si128(dst + 125, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_4));
	_mm_storeu_si128(dst + 126, _mm_and_si128(_mm_srli_epi64(packed, 24), mask_4));
	_mm_storeu_si128(dst + 127, _mm_and_si128(_mm_srli_epi64(packed, 28), mask_4));

	in += 256;		// 16 words * 16 bytes consumed
	to += 512;		// 16 words * 32 integers produced
	break;
	}
case 0x41:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
in += 240;
to += 480;
break;
}
case 0x42:
	{
	/*
		Key byte 0x42: unpack 14 consecutive 16-byte vectors of 4-bit values.

		Every 32-bit lane of an input vector carries eight 4-bit integers, so one
		input vector expands into eight output vectors (shift by 0, 4, ..., 28 and
		keep the low four bits of each lane).  Each output vector holds four
		uint32_t results.

		_mm_srli_epi64 shifts 64-bit lanes, so bits from the upper 32-bit half slide
		into the lower half; because the shift never exceeds 28 those stray bits land
		above bit 3 and are removed by mask_4.

		Loads and stores are issued in the same order and at the same addresses as a
		fully unrolled sequence would: vector 0 is read and its eight results written
		before vector 1 is read, and so on.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 14; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *out = unpacked + word * 8;

		_mm_storeu_si128(out + 0, _mm_and_si128(payload, mask_4));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(payload, 4), mask_4));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(payload, 8), mask_4));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_4));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(payload, 16), mask_4));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_4));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_4));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(payload, 28), mask_4));
		}

	in += 224;		// 14 vectors * 16 bytes consumed
	to += 448;		// 14 vectors * 8 stores * 4 integers produced
	break;			// leaves the switch (deliberately outside the for loop)
	}
case 0x43:
	{
	/*
		Key byte 0x43: unpack 13 consecutive 16-byte vectors of 4-bit values.

		Every 32-bit lane of an input vector carries eight 4-bit integers, so one
		input vector expands into eight output vectors (shift by 0, 4, ..., 28 and
		keep the low four bits of each lane).  Each output vector holds four
		uint32_t results.

		_mm_srli_epi64 shifts 64-bit lanes, so bits from the upper 32-bit half slide
		into the lower half; because the shift never exceeds 28 those stray bits land
		above bit 3 and are removed by mask_4.

		Loads and stores are issued in the same order and at the same addresses as a
		fully unrolled sequence would: vector 0 is read and its eight results written
		before vector 1 is read, and so on.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 13; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *out = unpacked + word * 8;

		_mm_storeu_si128(out + 0, _mm_and_si128(payload, mask_4));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(payload, 4), mask_4));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(payload, 8), mask_4));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_4));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(payload, 16), mask_4));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_4));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_4));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(payload, 28), mask_4));
		}

	in += 208;		// 13 vectors * 16 bytes consumed
	to += 416;		// 13 vectors * 8 stores * 4 integers produced
	break;			// leaves the switch (deliberately outside the for loop)
	}
case 0x44:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
}
in += 192;
to += 384;
break;
}
case 0x45:
	{
	/*
		Selector 0x45: unpack eleven consecutive 16-byte words from `in`.
		Every word is expanded into eight 16-byte stores to `to`; store k holds,
		in each 32-bit lane, bits [4k, 4k+3] of the corresponding lane of the word.
		Totals: 11 * 16 = 176 bytes consumed, 11 * 8 * 4 = 352 integers produced.

		The shift is done on 64-bit lanes, but because the count never exceeds 28
		and mask_4 keeps only the low four bits of each 32-bit lane, nothing that
		crosses over from the upper half of a 64-bit lane survives the mask.
	*/
	const __m128i *packed_words = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 11; word++)
		{
		/* Load first, then emit the eight nibble planes, lowest nibble first. */
		const __m128i nibbles = _mm_loadu_si128(packed_words + word);
		__m128i *out = unpacked + word * 8;

		_mm_storeu_si128(out + 0, _mm_and_si128(nibbles, mask_4));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(nibbles, 4), mask_4));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(nibbles, 8), mask_4));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(nibbles, 12), mask_4));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(nibbles, 16), mask_4));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(nibbles, 20), mask_4));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(nibbles, 24), mask_4));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(nibbles, 28), mask_4));
		}

	in += 176;		/* bytes of payload consumed */
	to += 352;		/* 32-bit integers written */
	break;
	}
case 0x46:
	{
	/*
		Selector 0x46: unpack ten consecutive 16-byte words from `in`.
		Every word is expanded into eight 16-byte stores to `to`; store k holds,
		in each 32-bit lane, bits [4k, 4k+3] of the corresponding lane of the word.
		Totals: 10 * 16 = 160 bytes consumed, 10 * 8 * 4 = 320 integers produced.

		The shift is done on 64-bit lanes, but because the count never exceeds 28
		and mask_4 keeps only the low four bits of each 32-bit lane, nothing that
		crosses over from the upper half of a 64-bit lane survives the mask.
	*/
	const __m128i *packed_words = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 10; word++)
		{
		/* Load first, then emit the eight nibble planes, lowest nibble first. */
		const __m128i nibbles = _mm_loadu_si128(packed_words + word);
		__m128i *out = unpacked + word * 8;

		_mm_storeu_si128(out + 0, _mm_and_si128(nibbles, mask_4));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(nibbles, 4), mask_4));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(nibbles, 8), mask_4));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(nibbles, 12), mask_4));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(nibbles, 16), mask_4));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(nibbles, 20), mask_4));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(nibbles, 24), mask_4));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(nibbles, 28), mask_4));
		}

	in += 160;		/* bytes of payload consumed */
	to += 320;		/* 32-bit integers written */
	break;
	}
case 0x47:
	{
	/*
		Selector 0x47: unpack nine consecutive 16-byte words from `in`.
		Every word is expanded into eight 16-byte stores to `to`; store k holds,
		in each 32-bit lane, bits [4k, 4k+3] of the corresponding lane of the word.
		Totals: 9 * 16 = 144 bytes consumed, 9 * 8 * 4 = 288 integers produced.

		The shift is done on 64-bit lanes, but because the count never exceeds 28
		and mask_4 keeps only the low four bits of each 32-bit lane, nothing that
		crosses over from the upper half of a 64-bit lane survives the mask.
	*/
	const __m128i *packed_words = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 9; word++)
		{
		/* Load first, then emit the eight nibble planes, lowest nibble first. */
		const __m128i nibbles = _mm_loadu_si128(packed_words + word);
		__m128i *out = unpacked + word * 8;

		_mm_storeu_si128(out + 0, _mm_and_si128(nibbles, mask_4));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(nibbles, 4), mask_4));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(nibbles, 8), mask_4));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi64(nibbles, 12), mask_4));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi64(nibbles, 16), mask_4));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi64(nibbles, 20), mask_4));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi64(nibbles, 24), mask_4));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi64(nibbles, 28), mask_4));
		}

	in += 144;		/* bytes of payload consumed */
	to += 288;		/* 32-bit integers written */
	break;
	}
case 0x48:
{
	// Key byte 0x48: unpack eight 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 128 input bytes become 256 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 8; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 128;		// 8 words * 16 bytes consumed
	to += 256;		// 8 words * 8 vectors * 4 integers produced
	break;
}
case 0x49:
{
	// Key byte 0x49: unpack seven 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 112 input bytes become 224 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 7; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 112;		// 7 words * 16 bytes consumed
	to += 224;		// 7 words * 8 vectors * 4 integers produced
	break;
}
case 0x4a:
{
	// Key byte 0x4a: unpack six 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 96 input bytes become 192 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 6; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 96;		// 6 words * 16 bytes consumed
	to += 192;		// 6 words * 8 vectors * 4 integers produced
	break;
}
case 0x4b:
{
	// Key byte 0x4b: unpack five 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 80 input bytes become 160 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 5; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 80;		// 5 words * 16 bytes consumed
	to += 160;		// 5 words * 8 vectors * 4 integers produced
	break;
}
case 0x4c:
{
	// Key byte 0x4c: unpack four 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 64 input bytes become 128 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 4; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 64;		// 4 words * 16 bytes consumed
	to += 128;		// 4 words * 8 vectors * 4 integers produced
	break;
}
case 0x4d:
{
	// Key byte 0x4d: unpack three 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 48 input bytes become 96 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 3; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 48;		// 3 words * 16 bytes consumed
	to += 96;		// 3 words * 8 vectors * 4 integers produced
	break;
}
case 0x4e:
{
	// Key byte 0x4e: unpack two 128-bit words of 4-bit fields.
	// Every word expands to eight output vectors of four 32-bit integers,
	// so 32 input bytes become 64 integers.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 2; word++, unpacked += 8)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);

		// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
		// 32-bit lane throws away anything that slid across the lane boundary.
		_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));
	}

	in += 32;		// 2 words * 16 bytes consumed
	to += 64;		// 2 words * 8 vectors * 4 integers produced
	break;
}
case 0x4f:
{
	// Key byte 0x4f: unpack a single 128-bit word of 4-bit fields into
	// eight output vectors of four 32-bit integers (16 bytes -> 32 integers).
	__m128i *unpacked = (__m128i *)to;
	const __m128i bits = _mm_loadu_si128((const __m128i *)in);

	// The shift operates on 64-bit lanes, but the 4-bit mask applied to each
	// 32-bit lane throws away anything that slid across the lane boundary.
	_mm_storeu_si128(unpacked + 0, _mm_and_si128(bits, mask_4));
	_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(bits, 4), mask_4));
	_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(bits, 8), mask_4));
	_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_4));
	_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(bits, 16), mask_4));
	_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(bits, 20), mask_4));
	_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_4));
	_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi64(bits, 28), mask_4));

	in += 16;		// one 16-byte word consumed
	to += 32;		// 8 vectors * 4 integers produced
	break;
}
case 0x50:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_5));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
}
in += 256;
to += 384;
break;
}
case 0x51:
	{
	/*
		Key 0x51: unpack 15 consecutive 128-bit words of 5-bit fields.
		Every 32-bit lane of a word carries six fields at bit offsets
		0, 5, 10, 15, 20 and 25, so each word expands into six output
		vectors (15 * 6 * 4 = 360 integers in total).
	*/
	const __m128i *packed = (const __m128i *)in;
	const __m128i *const packed_end = packed + 15;
	__m128i *unpacked = (__m128i *)to;

	for (; packed != packed_end; ++packed, unpacked += 6)
		{
		/*
			The shift works on 64-bit lanes, so bits from the upper 32-bit
			half slide into the lower half; the 5-bit mask discards them.
			Shift counts stay literal so they remain immediates.
		*/
		const __m128i word = _mm_loadu_si128(packed);

		_mm_storeu_si128(unpacked, _mm_and_si128(word, mask_5));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(word, 5), mask_5));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(word, 10), mask_5));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(word, 15), mask_5));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(word, 20), mask_5));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(word, 25), mask_5));
		}

	in += 240;		// 15 words * 16 bytes consumed
	to += 360;		// 15 words * 6 vectors * 4 integers produced
	break;
	}
case 0x52:
	{
	/*
		Key 0x52: unpack 14 consecutive 128-bit words of 5-bit fields.
		Every 32-bit lane of a word carries six fields at bit offsets
		0, 5, 10, 15, 20 and 25, so each word expands into six output
		vectors (14 * 6 * 4 = 336 integers in total).
	*/
	const __m128i *packed = (const __m128i *)in;
	const __m128i *const packed_end = packed + 14;
	__m128i *unpacked = (__m128i *)to;

	for (; packed != packed_end; ++packed, unpacked += 6)
		{
		/*
			The shift works on 64-bit lanes, so bits from the upper 32-bit
			half slide into the lower half; the 5-bit mask discards them.
			Shift counts stay literal so they remain immediates.
		*/
		const __m128i word = _mm_loadu_si128(packed);

		_mm_storeu_si128(unpacked, _mm_and_si128(word, mask_5));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(word, 5), mask_5));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(word, 10), mask_5));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(word, 15), mask_5));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(word, 20), mask_5));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(word, 25), mask_5));
		}

	in += 224;		// 14 words * 16 bytes consumed
	to += 336;		// 14 words * 6 vectors * 4 integers produced
	break;
	}
case 0x53:
	{
	/*
		Key 0x53: unpack 13 consecutive 128-bit words of 5-bit fields.
		Every 32-bit lane of a word carries six fields at bit offsets
		0, 5, 10, 15, 20 and 25, so each word expands into six output
		vectors (13 * 6 * 4 = 312 integers in total).
	*/
	const __m128i *packed = (const __m128i *)in;
	const __m128i *const packed_end = packed + 13;
	__m128i *unpacked = (__m128i *)to;

	for (; packed != packed_end; ++packed, unpacked += 6)
		{
		/*
			The shift works on 64-bit lanes, so bits from the upper 32-bit
			half slide into the lower half; the 5-bit mask discards them.
			Shift counts stay literal so they remain immediates.
		*/
		const __m128i word = _mm_loadu_si128(packed);

		_mm_storeu_si128(unpacked, _mm_and_si128(word, mask_5));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(word, 5), mask_5));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(word, 10), mask_5));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(word, 15), mask_5));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(word, 20), mask_5));
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(word, 25), mask_5));
		}

	in += 208;		// 13 words * 16 bytes consumed
	to += 312;		// 13 words * 6 vectors * 4 integers produced
	break;
	}
case 0x54:
    {
    /*
        Selector 0x54: unpack 12 consecutive 16-byte words of 5-bit integers.
        Each word holds six 5-bit fields in every 32-bit lane (bit offsets
        0, 5, 10, 15, 20, 25), so one load produces six 128-bit stores.
        The shifts are 64-bit wide, but because shift + 5 <= 32 any bits that
        cross over from the neighbouring lane land above the 5-bit mask and
        are discarded by the AND with mask_5.
        The trip count is a compile-time constant and the shift counts stay
        literal, so the load/store sequence matches the fully unrolled form.
    */
    __m128i *unpacked = (__m128i *)to;
    for (int word = 0; word < 12; word++, unpacked += 6)
        {
        const __m128i packed = _mm_loadu_si128((__m128i *)in + word);
        _mm_storeu_si128(unpacked, _mm_and_si128(packed, mask_5));
        _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(packed, 5), mask_5));
        _mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(packed, 10), mask_5));
        _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_5));
        _mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_5));
        _mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(packed, 25), mask_5));
        }
    in += 192;      // 12 words * 16 bytes consumed
    to += 288;      // 12 words * 6 vectors * 4 integers produced
    break;
    }
case 0x55:
    {
    /*
        Selector 0x55: unpack 11 consecutive 16-byte words of 5-bit integers.
        Each word holds six 5-bit fields in every 32-bit lane (bit offsets
        0, 5, 10, 15, 20, 25), so one load produces six 128-bit stores.
        The shifts are 64-bit wide, but because shift + 5 <= 32 any bits that
        cross over from the neighbouring lane land above the 5-bit mask and
        are discarded by the AND with mask_5.
        The trip count is a compile-time constant and the shift counts stay
        literal, so the load/store sequence matches the fully unrolled form.
    */
    __m128i *unpacked = (__m128i *)to;
    for (int word = 0; word < 11; word++, unpacked += 6)
        {
        const __m128i packed = _mm_loadu_si128((__m128i *)in + word);
        _mm_storeu_si128(unpacked, _mm_and_si128(packed, mask_5));
        _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(packed, 5), mask_5));
        _mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(packed, 10), mask_5));
        _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_5));
        _mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_5));
        _mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(packed, 25), mask_5));
        }
    in += 176;      // 11 words * 16 bytes consumed
    to += 264;      // 11 words * 6 vectors * 4 integers produced
    break;
    }
case 0x56:
    {
    /*
        Selector 0x56: unpack 10 consecutive 16-byte words of 5-bit integers.
        Each word holds six 5-bit fields in every 32-bit lane (bit offsets
        0, 5, 10, 15, 20, 25), so one load produces six 128-bit stores.
        The shifts are 64-bit wide, but because shift + 5 <= 32 any bits that
        cross over from the neighbouring lane land above the 5-bit mask and
        are discarded by the AND with mask_5.
        The trip count is a compile-time constant and the shift counts stay
        literal, so the load/store sequence matches the fully unrolled form.
    */
    __m128i *unpacked = (__m128i *)to;
    for (int word = 0; word < 10; word++, unpacked += 6)
        {
        const __m128i packed = _mm_loadu_si128((__m128i *)in + word);
        _mm_storeu_si128(unpacked, _mm_and_si128(packed, mask_5));
        _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(packed, 5), mask_5));
        _mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(packed, 10), mask_5));
        _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_5));
        _mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_5));
        _mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(packed, 25), mask_5));
        }
    in += 160;      // 10 words * 16 bytes consumed
    to += 240;      // 10 words * 6 vectors * 4 integers produced
    break;
    }
case 0x57:
    {
    /*
        Selector 0x57: unpack 9 consecutive 16-byte words of 5-bit integers.
        Each word holds six 5-bit fields in every 32-bit lane (bit offsets
        0, 5, 10, 15, 20, 25), so one load produces six 128-bit stores.
        The shifts are 64-bit wide, but because shift + 5 <= 32 any bits that
        cross over from the neighbouring lane land above the 5-bit mask and
        are discarded by the AND with mask_5.
        The trip count is a compile-time constant and the shift counts stay
        literal, so the load/store sequence matches the fully unrolled form.
    */
    __m128i *unpacked = (__m128i *)to;
    for (int word = 0; word < 9; word++, unpacked += 6)
        {
        const __m128i packed = _mm_loadu_si128((__m128i *)in + word);
        _mm_storeu_si128(unpacked, _mm_and_si128(packed, mask_5));
        _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(packed, 5), mask_5));
        _mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(packed, 10), mask_5));
        _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_5));
        _mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_5));
        _mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(packed, 25), mask_5));
        }
    in += 144;      // 9 words * 16 bytes consumed
    to += 216;      // 9 words * 6 vectors * 4 integers produced
    break;
    }
case 0x58:
    {
    /*
        Selector 0x58: unpack 8 consecutive 16-byte words of 5-bit integers.
        Each word holds six 5-bit fields in every 32-bit lane (bit offsets
        0, 5, 10, 15, 20, 25), so one load produces six 128-bit stores.
        The shifts are 64-bit wide, but because shift + 5 <= 32 any bits that
        cross over from the neighbouring lane land above the 5-bit mask and
        are discarded by the AND with mask_5.
        The trip count is a compile-time constant and the shift counts stay
        literal, so the load/store sequence matches the fully unrolled form.
    */
    __m128i *unpacked = (__m128i *)to;
    for (int word = 0; word < 8; word++, unpacked += 6)
        {
        const __m128i packed = _mm_loadu_si128((__m128i *)in + word);
        _mm_storeu_si128(unpacked, _mm_and_si128(packed, mask_5));
        _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi64(packed, 5), mask_5));
        _mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi64(packed, 10), mask_5));
        _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi64(packed, 15), mask_5));
        _mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi64(packed, 20), mask_5));
        _mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi64(packed, 25), mask_5));
        }
    in += 128;      // 8 words * 16 bytes consumed
    to += 192;      // 8 words * 6 vectors * 4 integers produced
    break;
    }
case 0x59:
	{
	/*
		Key 0x59: unpack seven 16-byte words of 5-bit fields.
		Every word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 7 * 16 = 112 input bytes become 7 * 6 * 4 = 168 integers.
		The shifts are 64 bits wide, but whatever is dragged in from the neighbouring lane
		lands above the low 5 bits kept by mask_5, so each 32-bit lane decodes independently.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 7; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 6;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
		_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));
		}

	in += 112;		// 7 words * 16 bytes consumed
	to += 168;		// 42 vectors * 4 integers produced
	break;
	}
case 0x5a:
	{
	/*
		Key 0x5a: unpack six 16-byte words of 5-bit fields.
		Every word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 6 * 16 = 96 input bytes become 6 * 6 * 4 = 144 integers.
		mask_5 discards everything above the low 5 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 6; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 6;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
		_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));
		}

	in += 96;		// 6 words * 16 bytes consumed
	to += 144;		// 36 vectors * 4 integers produced
	break;
	}
case 0x5b:
	{
	/*
		Key 0x5b: unpack five 16-byte words of 5-bit fields.
		Every word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 5 * 16 = 80 input bytes become 5 * 6 * 4 = 120 integers.
		mask_5 discards everything above the low 5 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 5; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 6;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
		_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));
		}

	in += 80;		// 5 words * 16 bytes consumed
	to += 120;		// 30 vectors * 4 integers produced
	break;
	}
case 0x5c:
	{
	/*
		Key 0x5c: unpack four 16-byte words of 5-bit fields.
		Every word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 4 * 16 = 64 input bytes become 4 * 6 * 4 = 96 integers.
		mask_5 discards everything above the low 5 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 4; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 6;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
		_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));
		}

	in += 64;		// 4 words * 16 bytes consumed
	to += 96;		// 24 vectors * 4 integers produced
	break;
	}
case 0x5d:
	{
	/*
		Key 0x5d: unpack three 16-byte words of 5-bit fields.
		Every word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 3 * 16 = 48 input bytes become 3 * 6 * 4 = 72 integers.
		mask_5 discards everything above the low 5 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 3; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 6;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
		_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));
		}

	in += 48;		// 3 words * 16 bytes consumed
	to += 72;		// 18 vectors * 4 integers produced
	break;
	}
case 0x5e:
	{
	/*
		Key 0x5e: unpack two 16-byte words of 5-bit fields.
		Every word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 2 * 16 = 32 input bytes become 2 * 6 * 4 = 48 integers.
		mask_5 discards everything above the low 5 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 2; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 6;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
		_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));
		}

	in += 32;		// 2 words * 16 bytes consumed
	to += 48;		// 12 vectors * 4 integers produced
	break;
	}
case 0x5f:
	{
	/*
		Key 0x5f: unpack a single 16-byte word of 5-bit fields.
		The word produces six output vectors (the fields at bit offsets 0, 5, 10, 15, 20
		and 25 of each 32-bit lane), so 16 input bytes become 6 * 4 = 24 integers.
		mask_5 discards everything above the low 5 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i payload = _mm_loadu_si128((const __m128i *)in);
	__m128i *slot = (__m128i *)to;

	_mm_storeu_si128(slot, _mm_and_si128(payload, mask_5));
	_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 5), mask_5));
	_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 10), mask_5));
	_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 15), mask_5));
	_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 20), mask_5));
	_mm_storeu_si128(slot + 5, _mm_and_si128(_mm_srli_epi64(payload, 25), mask_5));

	in += 16;		// 1 word * 16 bytes consumed
	to += 24;		// 6 vectors * 4 integers produced
	break;
	}
case 0x60:
	{
	/*
		Key 0x60: unpack sixteen 16-byte words of 6-bit fields.
		Every word produces five output vectors (the fields at bit offsets 0, 6, 12, 18
		and 24 of each 32-bit lane), so 16 * 16 = 256 input bytes become 16 * 5 * 4 = 320 integers.
		mask_6 discards everything above the low 6 bits, including bits the 64-bit shift
		pulls across from the neighbouring lane.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 16; word++)
		{
		const __m128i payload = _mm_loadu_si128(packed + word);
		__m128i *slot = unpacked + word * 5;

		_mm_storeu_si128(slot, _mm_and_si128(payload, mask_6));
		_mm_storeu_si128(slot + 1, _mm_and_si128(_mm_srli_epi64(payload, 6), mask_6));
		_mm_storeu_si128(slot + 2, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_6));
		_mm_storeu_si128(slot + 3, _mm_and_si128(_mm_srli_epi64(payload, 18), mask_6));
		_mm_storeu_si128(slot + 4, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_6));
		}

	in += 256;		// 16 words * 16 bytes consumed
	to += 320;		// 80 vectors * 4 integers produced
	break;
	}
case 0x61:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
in += 240;
to += 300;
break;
}
case 0x62:
	{
	/*
		Selector 0x62: unpack 14 128-bit vectors (224 bytes) of input.  Each 32-bit lane of
		a vector holds five 6-bit values at bit offsets 0, 6, 12, 18 and 24, so every input
		vector expands into five output vectors (70 vectors == 280 uint32_t in total).

		_mm_srli_epi64() shifts within 64-bit halves, so bits from the odd 32-bit lanes leak
		into the top of the even lanes.  The largest shift is 24, which puts the leaked bits at
		bit 8 or above, and mask_6 (0x3f in each lane) throws them away.

		NOTE(review): the original was fully unrolled; confirm the optimiser still unrolls this
		fixed-count loop on the benchmark build.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int vector = 0; vector < 14; vector++)
		{
		const __m128i payload = _mm_loadu_si128(packed + vector);
		__m128i *into = unpacked + vector * 5;

		/* The shift counts are kept as literals so that they remain immediates. */
		_mm_storeu_si128(into, _mm_and_si128(payload, mask_6));
		_mm_storeu_si128(into + 1, _mm_and_si128(_mm_srli_epi64(payload, 6), mask_6));
		_mm_storeu_si128(into + 2, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_6));
		_mm_storeu_si128(into + 3, _mm_and_si128(_mm_srli_epi64(payload, 18), mask_6));
		_mm_storeu_si128(into + 4, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_6));
		}

	in += 224;		/* 14 vectors * 16 bytes consumed */
	to += 280;		/* 14 vectors * 5 stores * 4 integers produced */
	break;
	}
case 0x63:
	{
	/*
		Selector 0x63: unpack 13 128-bit vectors (208 bytes) of input.  Each 32-bit lane of
		a vector holds five 6-bit values at bit offsets 0, 6, 12, 18 and 24, so every input
		vector expands into five output vectors (65 vectors == 260 uint32_t in total).

		_mm_srli_epi64() shifts within 64-bit halves, so bits from the odd 32-bit lanes leak
		into the top of the even lanes.  The largest shift is 24, which puts the leaked bits at
		bit 8 or above, and mask_6 (0x3f in each lane) throws them away.

		NOTE(review): the original was fully unrolled; confirm the optimiser still unrolls this
		fixed-count loop on the benchmark build.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int vector = 0; vector < 13; vector++)
		{
		const __m128i payload = _mm_loadu_si128(packed + vector);
		__m128i *into = unpacked + vector * 5;

		/* The shift counts are kept as literals so that they remain immediates. */
		_mm_storeu_si128(into, _mm_and_si128(payload, mask_6));
		_mm_storeu_si128(into + 1, _mm_and_si128(_mm_srli_epi64(payload, 6), mask_6));
		_mm_storeu_si128(into + 2, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_6));
		_mm_storeu_si128(into + 3, _mm_and_si128(_mm_srli_epi64(payload, 18), mask_6));
		_mm_storeu_si128(into + 4, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_6));
		}

	in += 208;		/* 13 vectors * 16 bytes consumed */
	to += 260;		/* 13 vectors * 5 stores * 4 integers produced */
	break;
	}
case 0x64:
	{
	/*
		Selector 0x64: unpack 12 128-bit vectors (192 bytes) of input.  Each 32-bit lane of
		a vector holds five 6-bit values at bit offsets 0, 6, 12, 18 and 24, so every input
		vector expands into five output vectors (60 vectors == 240 uint32_t in total).

		_mm_srli_epi64() shifts within 64-bit halves, so bits from the odd 32-bit lanes leak
		into the top of the even lanes.  The largest shift is 24, which puts the leaked bits at
		bit 8 or above, and mask_6 (0x3f in each lane) throws them away.

		NOTE(review): the original was fully unrolled; confirm the optimiser still unrolls this
		fixed-count loop on the benchmark build.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int vector = 0; vector < 12; vector++)
		{
		const __m128i payload = _mm_loadu_si128(packed + vector);
		__m128i *into = unpacked + vector * 5;

		/* The shift counts are kept as literals so that they remain immediates. */
		_mm_storeu_si128(into, _mm_and_si128(payload, mask_6));
		_mm_storeu_si128(into + 1, _mm_and_si128(_mm_srli_epi64(payload, 6), mask_6));
		_mm_storeu_si128(into + 2, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_6));
		_mm_storeu_si128(into + 3, _mm_and_si128(_mm_srli_epi64(payload, 18), mask_6));
		_mm_storeu_si128(into + 4, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_6));
		}

	in += 192;		/* 12 vectors * 16 bytes consumed */
	to += 240;		/* 12 vectors * 5 stores * 4 integers produced */
	break;
	}
case 0x65:
	{
	/*
		Selector 0x65: unpack 11 128-bit vectors (176 bytes) of input.  Each 32-bit lane of
		a vector holds five 6-bit values at bit offsets 0, 6, 12, 18 and 24, so every input
		vector expands into five output vectors (55 vectors == 220 uint32_t in total).

		_mm_srli_epi64() shifts within 64-bit halves, so bits from the odd 32-bit lanes leak
		into the top of the even lanes.  The largest shift is 24, which puts the leaked bits at
		bit 8 or above, and mask_6 (0x3f in each lane) throws them away.

		NOTE(review): the original was fully unrolled; confirm the optimiser still unrolls this
		fixed-count loop on the benchmark build.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int vector = 0; vector < 11; vector++)
		{
		const __m128i payload = _mm_loadu_si128(packed + vector);
		__m128i *into = unpacked + vector * 5;

		/* The shift counts are kept as literals so that they remain immediates. */
		_mm_storeu_si128(into, _mm_and_si128(payload, mask_6));
		_mm_storeu_si128(into + 1, _mm_and_si128(_mm_srli_epi64(payload, 6), mask_6));
		_mm_storeu_si128(into + 2, _mm_and_si128(_mm_srli_epi64(payload, 12), mask_6));
		_mm_storeu_si128(into + 3, _mm_and_si128(_mm_srli_epi64(payload, 18), mask_6));
		_mm_storeu_si128(into + 4, _mm_and_si128(_mm_srli_epi64(payload, 24), mask_6));
		}

	in += 176;		/* 11 vectors * 16 bytes consumed */
	to += 220;		/* 11 vectors * 5 stores * 4 integers produced */
	break;
	}
case 0x66:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
}
in += 160;
to += 200;
break;
}
case 0x67:
{
	// Key 0x67: unpack nine consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 9; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// The shift is 64 bits wide, but mask_6 keeps only the low 6 bits of each
		// 32-bit lane and the largest shift is 24, so nothing from the
		// neighbouring lane survives the mask.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 144;	// 9 words * 16 bytes
	to += 180;	// 9 words * 20 integers
	break;
}
case 0x68:
{
	// Key 0x68: unpack eight consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 8; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 128;	// 8 words * 16 bytes
	to += 160;	// 8 words * 20 integers
	break;
}
case 0x69:
{
	// Key 0x69: unpack seven consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 7; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 112;	// 7 words * 16 bytes
	to += 140;	// 7 words * 20 integers
	break;
}
case 0x6a:
{
	// Key 0x6a: unpack six consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 6; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 96;	// 6 words * 16 bytes
	to += 120;	// 6 words * 20 integers
	break;
}
case 0x6b:
{
	// Key 0x6b: unpack five consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 5; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 80;	// 5 words * 16 bytes
	to += 100;	// 5 words * 20 integers
	break;
}
case 0x6c:
{
	// Key 0x6c: unpack four consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 4; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 64;	// 4 words * 16 bytes
	to += 80;	// 4 words * 20 integers
	break;
}
case 0x6d:
{
	// Key 0x6d: unpack three consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 3; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 48;	// 3 words * 16 bytes
	to += 60;	// 3 words * 20 integers
	break;
}
case 0x6e:
{
	// Key 0x6e: unpack two consecutive 16-byte words of 6-bit integers.
	// Every word produces five output vectors (the 6-bit fields at bit offsets
	// 0, 6, 12, 18 and 24), i.e. 20 uint32_t values per word.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;
	for (int word = 0; word < 2; word++)
	{
		const __m128i bits = _mm_loadu_si128(packed + word);
		__m128i *dest = unpacked + 5 * word;
		// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
		// each 32-bit lane and the largest shift is 24.
		_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
		_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
		_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
		_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
		_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	}
	in += 32;	// 2 words * 16 bytes
	to += 40;	// 2 words * 20 integers
	break;
}
case 0x6f:
{
	// Key 0x6f: unpack a single 16-byte word of 6-bit integers into five
	// output vectors (20 uint32_t values), one per 6-bit field at bit
	// offsets 0, 6, 12, 18 and 24.
	const __m128i bits = _mm_loadu_si128((const __m128i *)in);
	__m128i *dest = (__m128i *)to;
	// 64-bit shifts are safe here: mask_6 discards everything above bit 5 of
	// each 32-bit lane and the largest shift is 24.
	_mm_storeu_si128(dest, _mm_and_si128(bits, mask_6));
	_mm_storeu_si128(dest + 1, _mm_and_si128(_mm_srli_epi64(bits, 6), mask_6));
	_mm_storeu_si128(dest + 2, _mm_and_si128(_mm_srli_epi64(bits, 12), mask_6));
	_mm_storeu_si128(dest + 3, _mm_and_si128(_mm_srli_epi64(bits, 18), mask_6));
	_mm_storeu_si128(dest + 4, _mm_and_si128(_mm_srli_epi64(bits, 24), mask_6));
	in += 16;	// one 16-byte word consumed
	to += 20;	// five vectors of four integers produced
	break;
}
case 0x70:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26);
_mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27);
_mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28);
_mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29);
_mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30);
_mm_storeu_si128((__m128i *)to + 135, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31);
_mm_storeu_si128((__m128i *)to + 135 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 135 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
in += 512;
to += 576;
break;
}
case 0x71:
{
	// Selector 0x71: unpack fifteen groups of 7-bit fields.
	// Each group reads two consecutive 128-bit source words and writes nine
	// 128-bit result words. All shifts operate on the four 32-bit lanes
	// independently and every result is masked with mask_7.
	for (int group = 0; group < 15; group++)
		{
		const __m128i *packed = (const __m128i *)in + 2 * group;
		__m128i *unpacked = (__m128i *)to + 9 * group;

		// First source word: four whole fields at bit offsets 0, 7, 14 and 21.
		const __m128i first_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(first_word, mask_7));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(first_word, 7), mask_7));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi32(first_word, 14), mask_7));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(first_word, 21), mask_7));

		// The second word is loaded only after the stores above, as in the unrolled form.
		const __m128i second_word = _mm_loadu_si128(packed + 1);

		// Fifth field straddles the two words: the top 4 bits of the first word
		// supply its low bits and the low 3 bits of the second word supply the rest.
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 4), _mm_srli_epi32(first_word, 28));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(straddling, mask_7));

		// Remaining four fields sit at bit offsets 3, 10, 17 and 24 of the second
		// word; bit 31 of each lane does not contribute to any output.
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi32(second_word, 3), mask_7));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi32(second_word, 10), mask_7));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi32(second_word, 17), mask_7));
		_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi32(second_word, 24), mask_7));
		}

	// 15 groups * 2 source words * 16 bytes consumed; 15 groups * 9 result words * 4 integers produced.
	in += 480;
	to += 540;
	break;
}
case 0x72:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26);
_mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27);
_mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
in += 448;
to += 504;
break;
}
case 0x73:
{
    /*
        Key 0x73: unpack 13 groups of 7-bit integers.

        One group is two consecutive 128-bit input words.  Within each 32-bit lane those 64 bits hold
        nine 7-bit values: four whole values in the first word, one value split across the two words
        (4 bits from the top of the first, 3 bits from the bottom of the second), and four more whole
        values in the second word.  Every group therefore produces nine 128-bit output words.
        mask_7 keeps the low 7 bits (0x7f) of each lane.
    */
    const int group_count = 13;

    for (int group = 0; group < group_count; group++)
    {
        const __m128i *packed = (__m128i *)in + 2 * group;
        __m128i *unpacked = (__m128i *)to + 9 * group;

        /* Values 0..3: bit offsets 0, 7, 14 and 21 of the first word. */
        const __m128i low = _mm_loadu_si128(packed);
        _mm_storeu_si128(unpacked, _mm_and_si128(low, mask_7));
        _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(low, 7), mask_7));
        _mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi32(low, 14), mask_7));
        _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(low, 21), mask_7));

        /* The second word is read only after the first four stores, as in the unrolled form. */
        const __m128i high = _mm_loadu_si128(packed + 1);

        /* Value 4 straddles the words: top 4 bits of the first word joined to the low 3 bits of the second. */
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(high, 4), _mm_srli_epi32(low, 28));
        _mm_storeu_si128(unpacked + 4, _mm_and_si128(straddle, mask_7));

        /* Values 5..8: bit offsets 3, 10, 17 and 24 of the second word. */
        _mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi32(high, 3), mask_7));
        _mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi32(high, 10), mask_7));
        _mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi32(high, 17), mask_7));
        _mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi32(high, 24), mask_7));
    }

    in += 416;      /* 13 groups * 2 words * 16 bytes consumed */
    to += 468;      /* 13 groups * 9 words * 4 integers produced */
    break;
}
case 0x74:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
in += 384;
to += 432;
break;
}
case 0x75:
	{
	// Key 0x75: unpack 11 consecutive groups of 7-bit integers.
	// Every group reads two 128-bit words from `in` and writes nine 128-bit
	// words (four 32-bit lanes each) to `to`.  Within a lane the nine values
	// occupy 63 bits spread over the pair of 32-bit words; the fifth value
	// straddles the boundary between them.
	for (int group = 0; group < 11; group++)
		{
		const __m128i *packed = (const __m128i *)in + 2 * group;
		__m128i *unpacked = (__m128i *)to + 9 * group;

		// Values 0..3 live entirely in the first word (bits 0-27 of each lane).
		const __m128i low_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(low_word, mask_7));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(low_word, 7), mask_7));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi32(low_word, 14), mask_7));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(low_word, 21), mask_7));

		// Value 4 is the top 4 bits of the first word joined with the low
		// 3 bits of the second word (moved up by 4 so they sit above them).
		const __m128i high_word = _mm_loadu_si128(packed + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_word, 4), _mm_srli_epi32(low_word, 28));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(straddling, mask_7));

		// Values 5..8 come from bits 3-30 of the second word.
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi32(high_word, 3), mask_7));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi32(high_word, 10), mask_7));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi32(high_word, 17), mask_7));
		_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi32(high_word, 24), mask_7));
		}

	// 11 groups * 32 bytes consumed, 11 groups * 36 integers produced.
	in += 352;
	to += 396;
	break;
	}
case 0x76:
	{
	// Key 0x76: unpack 10 consecutive groups of 7-bit integers.
	// Every group reads two 128-bit words from `in` and writes nine 128-bit
	// words (four 32-bit lanes each) to `to`.  Within a lane the nine values
	// occupy 63 bits spread over the pair of 32-bit words; the fifth value
	// straddles the boundary between them.
	for (int group = 0; group < 10; group++)
		{
		const __m128i *packed = (const __m128i *)in + 2 * group;
		__m128i *unpacked = (__m128i *)to + 9 * group;

		// Values 0..3 live entirely in the first word (bits 0-27 of each lane).
		const __m128i low_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(low_word, mask_7));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(low_word, 7), mask_7));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(_mm_srli_epi32(low_word, 14), mask_7));
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(low_word, 21), mask_7));

		// Value 4 is the top 4 bits of the first word joined with the low
		// 3 bits of the second word (moved up by 4 so they sit above them).
		const __m128i high_word = _mm_loadu_si128(packed + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_word, 4), _mm_srli_epi32(low_word, 28));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(straddling, mask_7));

		// Values 5..8 come from bits 3-30 of the second word.
		_mm_storeu_si128(unpacked + 5, _mm_and_si128(_mm_srli_epi32(high_word, 3), mask_7));
		_mm_storeu_si128(unpacked + 6, _mm_and_si128(_mm_srli_epi32(high_word, 10), mask_7));
		_mm_storeu_si128(unpacked + 7, _mm_and_si128(_mm_srli_epi32(high_word, 17), mask_7));
		_mm_storeu_si128(unpacked + 8, _mm_and_si128(_mm_srli_epi32(high_word, 24), mask_7));
		}

	// 10 groups * 32 bytes consumed, 10 groups * 36 integers produced.
	in += 320;
	to += 360;
	break;
	}
case 0x77:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
}
in += 288;
to += 324;
break;
}
case 0x78:
{
    // Key 0x78: unpack 8 groups of 7-bit integers.  Each group reads two
    // 128-bit words from `in` and writes nine 128-bit vectors (36 integers)
    // to `to`; every output lane is masked with mask_7 (0x7f).
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    for (int group = 0; group < 8; group++)
    {
        __m128i *out = unpacked + 9 * group;

        // Vectors 0..3: bits 0..27 of each 32-bit lane of the first word.
        const __m128i first = _mm_loadu_si128(packed + 2 * group);
        _mm_storeu_si128(out + 0, _mm_and_si128(first, mask_7));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first, 7), mask_7));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first, 14), mask_7));
        _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first, 21), mask_7));

        // Vector 4 straddles both words: the top 4 bits of the first word
        // supply its low bits and the low 3 bits of the second word supply
        // the rest.  The second word is only loaded after the first four
        // stores, matching the original order of memory accesses.
        const __m128i second = _mm_loadu_si128(packed + 2 * group + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 4), _mm_srli_epi32(first, 28));
        _mm_storeu_si128(out + 4, _mm_and_si128(straddle, mask_7));

        // Vectors 5..8: bits 3..30 of each lane of the second word
        // (bit 31 of the second word is never read).
        _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second, 3), mask_7));
        _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second, 10), mask_7));
        _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second, 17), mask_7));
        _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second, 24), mask_7));
    }

    in += 256;    // 8 groups * 2 words * 16 bytes consumed
    to += 288;    // 8 groups * 9 vectors * 4 integers produced
    break;
}
case 0x79:
{
    // Key 0x79: unpack 7 groups of 7-bit integers.  Each group reads two
    // 128-bit words from `in` and writes nine 128-bit vectors (36 integers)
    // to `to`; every output lane is masked with mask_7 (0x7f).
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    for (int group = 0; group < 7; group++)
    {
        __m128i *out = unpacked + 9 * group;

        // Vectors 0..3: bits 0..27 of each 32-bit lane of the first word.
        const __m128i first = _mm_loadu_si128(packed + 2 * group);
        _mm_storeu_si128(out + 0, _mm_and_si128(first, mask_7));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first, 7), mask_7));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first, 14), mask_7));
        _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first, 21), mask_7));

        // Vector 4 straddles both words: the top 4 bits of the first word
        // supply its low bits and the low 3 bits of the second word supply
        // the rest.  The second word is only loaded after the first four
        // stores, matching the original order of memory accesses.
        const __m128i second = _mm_loadu_si128(packed + 2 * group + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 4), _mm_srli_epi32(first, 28));
        _mm_storeu_si128(out + 4, _mm_and_si128(straddle, mask_7));

        // Vectors 5..8: bits 3..30 of each lane of the second word
        // (bit 31 of the second word is never read).
        _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second, 3), mask_7));
        _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second, 10), mask_7));
        _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second, 17), mask_7));
        _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second, 24), mask_7));
    }

    in += 224;    // 7 groups * 2 words * 16 bytes consumed
    to += 252;    // 7 groups * 9 vectors * 4 integers produced
    break;
}
case 0x7a:
{
    // Key 0x7a: unpack 6 groups of 7-bit integers.  Each group reads two
    // 128-bit words from `in` and writes nine 128-bit vectors (36 integers)
    // to `to`; every output lane is masked with mask_7 (0x7f).
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    for (int group = 0; group < 6; group++)
    {
        __m128i *out = unpacked + 9 * group;

        // Vectors 0..3: bits 0..27 of each 32-bit lane of the first word.
        const __m128i first = _mm_loadu_si128(packed + 2 * group);
        _mm_storeu_si128(out + 0, _mm_and_si128(first, mask_7));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first, 7), mask_7));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first, 14), mask_7));
        _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first, 21), mask_7));

        // Vector 4 straddles both words: the top 4 bits of the first word
        // supply its low bits and the low 3 bits of the second word supply
        // the rest.  The second word is only loaded after the first four
        // stores, matching the original order of memory accesses.
        const __m128i second = _mm_loadu_si128(packed + 2 * group + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 4), _mm_srli_epi32(first, 28));
        _mm_storeu_si128(out + 4, _mm_and_si128(straddle, mask_7));

        // Vectors 5..8: bits 3..30 of each lane of the second word
        // (bit 31 of the second word is never read).
        _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second, 3), mask_7));
        _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second, 10), mask_7));
        _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second, 17), mask_7));
        _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second, 24), mask_7));
    }

    in += 192;    // 6 groups * 2 words * 16 bytes consumed
    to += 216;    // 6 groups * 9 vectors * 4 integers produced
    break;
}
case 0x7b:
{
    // Key 0x7b: unpack 5 groups of 7-bit integers.  Each group reads two
    // 128-bit words from `in` and writes nine 128-bit vectors (36 integers)
    // to `to`; every output lane is masked with mask_7 (0x7f).
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    for (int group = 0; group < 5; group++)
    {
        __m128i *out = unpacked + 9 * group;

        // Vectors 0..3: bits 0..27 of each 32-bit lane of the first word.
        const __m128i first = _mm_loadu_si128(packed + 2 * group);
        _mm_storeu_si128(out + 0, _mm_and_si128(first, mask_7));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first, 7), mask_7));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first, 14), mask_7));
        _mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first, 21), mask_7));

        // Vector 4 straddles both words: the top 4 bits of the first word
        // supply its low bits and the low 3 bits of the second word supply
        // the rest.  The second word is only loaded after the first four
        // stores, matching the original order of memory accesses.
        const __m128i second = _mm_loadu_si128(packed + 2 * group + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 4), _mm_srli_epi32(first, 28));
        _mm_storeu_si128(out + 4, _mm_and_si128(straddle, mask_7));

        // Vectors 5..8: bits 3..30 of each lane of the second word
        // (bit 31 of the second word is never read).
        _mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second, 3), mask_7));
        _mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second, 10), mask_7));
        _mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second, 17), mask_7));
        _mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second, 24), mask_7));
    }

    in += 160;    // 5 groups * 2 words * 16 bytes consumed
    to += 180;    // 5 groups * 9 vectors * 4 integers produced
    break;
}
case 0x7c:
	{
	// Key 0x7c: unpack four 32-byte groups of 7-bit integers.
	// A group is two 128-bit loads (four 32-bit lanes each) and produces nine
	// 128-bit stores, so every lane pair carries nine 7-bit values:
	//   values 0-3 : bits 0-27 of the first word
	//   value  4   : top 4 bits of the first word joined with the low 3 bits of the second
	//   values 5-8 : bits 3-30 of the second word (bit 31 is never read)
	// Overall 128 input bytes become 144 output integers.
	// The trip count is a compile-time constant, so the optimiser is free to unroll.
	for (int group = 0; group < 4; group++)
		{
		const __m128i *const src = (const __m128i *)in + 2 * group;
		__m128i *const out = (__m128i *)to + 9 * group;

		const __m128i first_word = _mm_loadu_si128(src);
		_mm_storeu_si128(out + 0, _mm_and_si128(first_word, mask_7));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first_word, 7), mask_7));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first_word, 14), mask_7));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first_word, 21), mask_7));

		const __m128i second_word = _mm_loadu_si128(src + 1);
		// The fifth value straddles the two words.
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 4), _mm_srli_epi32(first_word, 28));
		_mm_storeu_si128(out + 4, _mm_and_si128(straddling, mask_7));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second_word, 3), mask_7));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second_word, 10), mask_7));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second_word, 17), mask_7));
		_mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second_word, 24), mask_7));
		}
	in += 128;		// bytes consumed
	to += 144;		// integers produced
	break;
}
case 0x7d:
	{
	// Key 0x7d: unpack three 32-byte groups of 7-bit integers.
	// A group is two 128-bit loads (four 32-bit lanes each) and produces nine
	// 128-bit stores, so every lane pair carries nine 7-bit values:
	//   values 0-3 : bits 0-27 of the first word
	//   value  4   : top 4 bits of the first word joined with the low 3 bits of the second
	//   values 5-8 : bits 3-30 of the second word (bit 31 is never read)
	// Overall 96 input bytes become 108 output integers.
	// The trip count is a compile-time constant, so the optimiser is free to unroll.
	for (int group = 0; group < 3; group++)
		{
		const __m128i *const src = (const __m128i *)in + 2 * group;
		__m128i *const out = (__m128i *)to + 9 * group;

		const __m128i first_word = _mm_loadu_si128(src);
		_mm_storeu_si128(out + 0, _mm_and_si128(first_word, mask_7));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first_word, 7), mask_7));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first_word, 14), mask_7));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first_word, 21), mask_7));

		const __m128i second_word = _mm_loadu_si128(src + 1);
		// The fifth value straddles the two words.
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 4), _mm_srli_epi32(first_word, 28));
		_mm_storeu_si128(out + 4, _mm_and_si128(straddling, mask_7));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second_word, 3), mask_7));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second_word, 10), mask_7));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second_word, 17), mask_7));
		_mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second_word, 24), mask_7));
		}
	in += 96;		// bytes consumed
	to += 108;		// integers produced
	break;
}
case 0x7e:
	{
	// Key 0x7e: unpack two 32-byte groups of 7-bit integers.
	// A group is two 128-bit loads (four 32-bit lanes each) and produces nine
	// 128-bit stores, so every lane pair carries nine 7-bit values:
	//   values 0-3 : bits 0-27 of the first word
	//   value  4   : top 4 bits of the first word joined with the low 3 bits of the second
	//   values 5-8 : bits 3-30 of the second word (bit 31 is never read)
	// Overall 64 input bytes become 72 output integers.
	// The trip count is a compile-time constant, so the optimiser is free to unroll.
	for (int group = 0; group < 2; group++)
		{
		const __m128i *const src = (const __m128i *)in + 2 * group;
		__m128i *const out = (__m128i *)to + 9 * group;

		const __m128i first_word = _mm_loadu_si128(src);
		_mm_storeu_si128(out + 0, _mm_and_si128(first_word, mask_7));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first_word, 7), mask_7));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first_word, 14), mask_7));
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first_word, 21), mask_7));

		const __m128i second_word = _mm_loadu_si128(src + 1);
		// The fifth value straddles the two words.
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 4), _mm_srli_epi32(first_word, 28));
		_mm_storeu_si128(out + 4, _mm_and_si128(straddling, mask_7));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second_word, 3), mask_7));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second_word, 10), mask_7));
		_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second_word, 17), mask_7));
		_mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second_word, 24), mask_7));
		}
	in += 64;		// bytes consumed
	to += 72;		// integers produced
	break;
}
case 0x7f:
	{
	// Key 0x7f: unpack a single 32-byte group of 7-bit integers.
	// Two 128-bit loads (four 32-bit lanes each) produce nine 128-bit stores,
	// so every lane pair carries nine 7-bit values:
	//   values 0-3 : bits 0-27 of the first word
	//   value  4   : top 4 bits of the first word joined with the low 3 bits of the second
	//   values 5-8 : bits 3-30 of the second word (bit 31 is never read)
	// Overall 32 input bytes become 36 output integers.
	const __m128i *const src = (const __m128i *)in;
	__m128i *const out = (__m128i *)to;

	const __m128i first_word = _mm_loadu_si128(src);
	_mm_storeu_si128(out + 0, _mm_and_si128(first_word, mask_7));
	_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first_word, 7), mask_7));
	_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first_word, 14), mask_7));
	_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(first_word, 21), mask_7));

	const __m128i second_word = _mm_loadu_si128(src + 1);
	// The fifth value straddles the two words.
	const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 4), _mm_srli_epi32(first_word, 28));
	_mm_storeu_si128(out + 4, _mm_and_si128(straddling, mask_7));
	_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second_word, 3), mask_7));
	_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second_word, 10), mask_7));
	_mm_storeu_si128(out + 7, _mm_and_si128(_mm_srli_epi32(second_word, 17), mask_7));
	_mm_storeu_si128(out + 8, _mm_and_si128(_mm_srli_epi32(second_word, 24), mask_7));

	in += 32;		// bytes consumed
	to += 36;		// integers produced
	break;
}
case 0x80:
	{
	// Key 0x80: widen sixteen 128-bit vectors of 8-bit integers to 32-bit integers.
	// Each vector holds 16 bytes; they are zero-extended four at a time, in
	// byte order, into four consecutive 128-bit stores.
	// Overall 256 input bytes become 256 output integers.
	// The trip count is a compile-time constant, so the optimiser is free to unroll.
	for (int vector = 0; vector < 16; vector++)
		{
		__m128i *const out = (__m128i *)to + 4 * vector;
		const __m128i packed = _mm_loadu_si128((__m128i *)in + vector);

		// Bytes 0-3 are already in the low lane.
		_mm_storeu_si128(out + 0, _mm_cvtepu8_epi32(packed));
		// Bytes 4-7: bring 32-bit lane 1 down to lane 0 before widening.
		_mm_storeu_si128(out + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(packed), _mm_castsi128_ps(packed), 0x01))));

		// Move the upper 64 bits into the lower half, then repeat for bytes 8-15.
		const __m128i upper_half = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(packed), _mm_castsi128_ps(packed)));
		_mm_storeu_si128(out + 2, _mm_cvtepu8_epi32(upper_half));
		_mm_storeu_si128(out + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(upper_half), _mm_castsi128_ps(upper_half), 0x01))));
		}
	in += 256;		// bytes consumed
	to += 256;		// integers produced
	break;
}
case 0x81:
	{
	// Key 0x81: widen fifteen 128-bit vectors of 8-bit integers to 32-bit integers.
	// Each vector holds 16 bytes; they are zero-extended four at a time, in
	// byte order, into four consecutive 128-bit stores.
	// Overall 240 input bytes become 240 output integers.
	// The trip count is a compile-time constant, so the optimiser is free to unroll.
	for (int vector = 0; vector < 15; vector++)
		{
		__m128i *const out = (__m128i *)to + 4 * vector;
		const __m128i packed = _mm_loadu_si128((__m128i *)in + vector);

		// Bytes 0-3 are already in the low lane.
		_mm_storeu_si128(out + 0, _mm_cvtepu8_epi32(packed));
		// Bytes 4-7: bring 32-bit lane 1 down to lane 0 before widening.
		_mm_storeu_si128(out + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(packed), _mm_castsi128_ps(packed), 0x01))));

		// Move the upper 64 bits into the lower half, then repeat for bytes 8-15.
		const __m128i upper_half = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(packed), _mm_castsi128_ps(packed)));
		_mm_storeu_si128(out + 2, _mm_cvtepu8_epi32(upper_half));
		_mm_storeu_si128(out + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(upper_half), _mm_castsi128_ps(upper_half), 0x01))));
		}
	in += 240;		// bytes consumed
	to += 240;		// integers produced
	break;
}
case 0x82:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
in += 224;
to += 224;
break;
}
case 0x83:
{
	// Key 0x83: 13 groups of 16 packed bytes are zero-extended into 208
	// 32-bit integers.  The trip count is a compile-time constant and the
	// loads/stores are issued in the same order as the unrolled form.
	for (int group = 0; group < 13; group++)
	{
		// 16 source bytes, also viewed as four 32-bit float lanes so the
		// lane-moving intrinsics can be applied to them.
		const __m128i packed = _mm_loadu_si128((__m128i *)in + group);
		const __m128 lanes = _mm_castsi128_ps(packed);
		// Upper 64 bits (bytes 8..15) copied down into the low half.
		const __m128 upper_lanes = _mm_movehl_ps(lanes, lanes);
		__m128i *const out = (__m128i *)to + 4 * group;

		// _mm_cvtepu8_epi32 widens only the low four bytes, so each quarter
		// of the vector is moved into lane 0 before it is widened.
		_mm_storeu_si128(out, _mm_cvtepu8_epi32(packed));
		_mm_storeu_si128(out + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(lanes, lanes, 0x01))));
		_mm_storeu_si128(out + 2, _mm_cvtepu8_epi32(_mm_castps_si128(upper_lanes)));
		_mm_storeu_si128(out + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(upper_lanes, upper_lanes, 0x01))));
	}
	in += 208;	// 13 * 16 source bytes consumed
	to += 208;	// 13 * 16 integers produced
	break;
}
case 0x84:
{
	// Key 0x84: 12 groups of 16 packed bytes are zero-extended into 192
	// 32-bit integers.  The trip count is a compile-time constant and the
	// loads/stores are issued in the same order as the unrolled form.
	for (int group = 0; group < 12; group++)
	{
		// 16 source bytes, also viewed as four 32-bit float lanes so the
		// lane-moving intrinsics can be applied to them.
		const __m128i packed = _mm_loadu_si128((__m128i *)in + group);
		const __m128 lanes = _mm_castsi128_ps(packed);
		// Upper 64 bits (bytes 8..15) copied down into the low half.
		const __m128 upper_lanes = _mm_movehl_ps(lanes, lanes);
		__m128i *const out = (__m128i *)to + 4 * group;

		// _mm_cvtepu8_epi32 widens only the low four bytes, so each quarter
		// of the vector is moved into lane 0 before it is widened.
		_mm_storeu_si128(out, _mm_cvtepu8_epi32(packed));
		_mm_storeu_si128(out + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(lanes, lanes, 0x01))));
		_mm_storeu_si128(out + 2, _mm_cvtepu8_epi32(_mm_castps_si128(upper_lanes)));
		_mm_storeu_si128(out + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(upper_lanes, upper_lanes, 0x01))));
	}
	in += 192;	// 12 * 16 source bytes consumed
	to += 192;	// 12 * 16 integers produced
	break;
}
case 0x85:
{
	// Key 0x85: 11 groups of 16 packed bytes are zero-extended into 176
	// 32-bit integers.  The trip count is a compile-time constant and the
	// loads/stores are issued in the same order as the unrolled form.
	for (int group = 0; group < 11; group++)
	{
		// 16 source bytes, also viewed as four 32-bit float lanes so the
		// lane-moving intrinsics can be applied to them.
		const __m128i packed = _mm_loadu_si128((__m128i *)in + group);
		const __m128 lanes = _mm_castsi128_ps(packed);
		// Upper 64 bits (bytes 8..15) copied down into the low half.
		const __m128 upper_lanes = _mm_movehl_ps(lanes, lanes);
		__m128i *const out = (__m128i *)to + 4 * group;

		// _mm_cvtepu8_epi32 widens only the low four bytes, so each quarter
		// of the vector is moved into lane 0 before it is widened.
		_mm_storeu_si128(out, _mm_cvtepu8_epi32(packed));
		_mm_storeu_si128(out + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(lanes, lanes, 0x01))));
		_mm_storeu_si128(out + 2, _mm_cvtepu8_epi32(_mm_castps_si128(upper_lanes)));
		_mm_storeu_si128(out + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(upper_lanes, upper_lanes, 0x01))));
	}
	in += 176;	// 11 * 16 source bytes consumed
	to += 176;	// 11 * 16 integers produced
	break;
}
case 0x86:
{
	// Key 0x86: 10 groups of 16 packed bytes are zero-extended into 160
	// 32-bit integers.  The trip count is a compile-time constant and the
	// loads/stores are issued in the same order as the unrolled form.
	for (int group = 0; group < 10; group++)
	{
		// 16 source bytes, also viewed as four 32-bit float lanes so the
		// lane-moving intrinsics can be applied to them.
		const __m128i packed = _mm_loadu_si128((__m128i *)in + group);
		const __m128 lanes = _mm_castsi128_ps(packed);
		// Upper 64 bits (bytes 8..15) copied down into the low half.
		const __m128 upper_lanes = _mm_movehl_ps(lanes, lanes);
		__m128i *const out = (__m128i *)to + 4 * group;

		// _mm_cvtepu8_epi32 widens only the low four bytes, so each quarter
		// of the vector is moved into lane 0 before it is widened.
		_mm_storeu_si128(out, _mm_cvtepu8_epi32(packed));
		_mm_storeu_si128(out + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(lanes, lanes, 0x01))));
		_mm_storeu_si128(out + 2, _mm_cvtepu8_epi32(_mm_castps_si128(upper_lanes)));
		_mm_storeu_si128(out + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(upper_lanes, upper_lanes, 0x01))));
	}
	in += 160;	// 10 * 16 source bytes consumed
	to += 160;	// 10 * 16 integers produced
	break;
}
case 0x87:
{
// Key 0x87: widen 144 packed bytes (nine 128-bit words) from `in` into 144 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[48..63] -> to[48..63]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[64..79] -> to[64..79]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[80..95] -> to[80..95]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[96..111] -> to[96..111]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[112..127] -> to[112..127]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[128..143] -> to[128..143]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 144 bytes consumed, 144 integers produced
in += 144;
to += 144;
break;
}
case 0x88:
{
// Key 0x88: widen 128 packed bytes (eight 128-bit words) from `in` into 128 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[48..63] -> to[48..63]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[64..79] -> to[64..79]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[80..95] -> to[80..95]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[96..111] -> to[96..111]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[112..127] -> to[112..127]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 128 bytes consumed, 128 integers produced
in += 128;
to += 128;
break;
}
case 0x89:
{
// Key 0x89: widen 112 packed bytes (seven 128-bit words) from `in` into 112 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[48..63] -> to[48..63]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[64..79] -> to[64..79]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[80..95] -> to[80..95]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[96..111] -> to[96..111]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 112 bytes consumed, 112 integers produced
in += 112;
to += 112;
break;
}
case 0x8a:
{
// Key 0x8a: widen 96 packed bytes (six 128-bit words) from `in` into 96 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[48..63] -> to[48..63]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[64..79] -> to[64..79]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[80..95] -> to[80..95]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 96 bytes consumed, 96 integers produced
in += 96;
to += 96;
break;
}
case 0x8b:
{
// Key 0x8b: widen 80 packed bytes (five 128-bit words) from `in` into 80 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[48..63] -> to[48..63]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[64..79] -> to[64..79]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 80 bytes consumed, 80 integers produced
in += 80;
to += 80;
break;
}
case 0x8c:
{
// Key 0x8c: widen 64 packed bytes (four 128-bit words) from `in` into 64 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[48..63] -> to[48..63]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 64 bytes consumed, 64 integers produced
in += 64;
to += 64;
break;
}
case 0x8d:
{
// Key 0x8d: widen 48 packed bytes (three 128-bit words) from `in` into 48 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[32..47] -> to[32..47]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 48 bytes consumed, 48 integers produced
in += 48;
to += 48;
break;
}
case 0x8e:
{
// Key 0x8e: widen 32 packed bytes (two 128-bit words) from `in` into 32 32-bit integers at `to`.
// Each inner block loads one 16-byte word and zero-extends every byte with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
{
// bytes in[16..31] -> to[16..31]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 32 bytes consumed, 32 integers produced
in += 32;
to += 32;
break;
}
case 0x8f:
{
// Key 0x8f: widen 16 packed bytes (one 128-bit word) from `in` into 16 32-bit integers at `to`.
// The word is loaded once and every byte is zero-extended with _mm_cvtepu8_epi32.
// That intrinsic only converts the lowest 4 bytes of its argument, so the other three 4-byte
// groups are first moved into the low 32-bit lane (shuffle_ps 0x01 / movehl_ps) before widening.
// NOTE(review): presumably the low nibble of the key selects the run length (16 - nibble words) -- confirm against the encoder.
{
// bytes in[0..15] -> to[0..15]
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
// bytes 0..3 already sit in the low 32-bit lane
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
// shuffle 0x01 brings lane 1 (bytes 4..7) down to lane 0
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
// movehl copies the upper 64 bits (bytes 8..15) into the lower half
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
// lane 1 of tmp3 holds bytes 12..15
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
}
// `in` is a byte pointer and `to` a uint32_t pointer: 16 bytes consumed, 16 integers produced
in += 16;
to += 16;
break;
}
case 0x90:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26);
_mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27);
_mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28);
_mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29);
_mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30);
_mm_storeu_si128((__m128i *)to + 105, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 105 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 105 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31);
_mm_storeu_si128((__m128i *)to + 105 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 105 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 105 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 105 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
in += 512;
to += 448;
break;
}
case 0x91:
	{
	// Unpack 15 consecutive groups of 9-bit integers.  Each group reads two
	// 128-bit words from `in` (32 bytes) and writes seven 128-bit words to
	// `to` (28 uint32_t values), so the whole case consumes 480 bytes and
	// produces 420 integers.
	for (int group = 0; group < 15; group++)
		{
		__m128i *out = (__m128i *)to;

		// First word: three complete 9-bit fields per 32-bit lane (bits 0-26).
		const __m128i first = _mm_loadu_si128((__m128i *)in);
		_mm_storeu_si128(out, _mm_and_si128(first, mask_9));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first, 9), mask_9));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first, 18), mask_9));

		// The second word is loaded only after the first three stores, as in
		// the unrolled original, so the order of memory accesses is unchanged.
		const __m128i second = _mm_loadu_si128((__m128i *)in + 1);

		// Fourth field straddles the two words: bits 27-31 of the first word
		// supply its low 5 bits, bits 0-3 of the second word its high 4 bits.
		const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 5), _mm_srli_epi32(first, 27));
		_mm_storeu_si128(out + 3, _mm_and_si128(straddle, mask_9));

		// Remaining three fields sit at bit offsets 4, 13 and 22 of the
		// second word (bit 31 is not used).
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi32(second, 4), mask_9));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second, 13), mask_9));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second, 22), mask_9));

		in += 32;		// two 16-byte loads consumed
		to += 28;		// seven stores of four 32-bit integers produced
		}
	break;
	}
case 0x92:
	{
	// Unpack 14 consecutive groups of 9-bit integers.  Each group reads two
	// 128-bit words from `in` (32 bytes) and writes seven 128-bit words to
	// `to` (28 uint32_t values), so the whole case consumes 448 bytes and
	// produces 392 integers.
	for (int group = 0; group < 14; group++)
		{
		__m128i *out = (__m128i *)to;

		// First word: three complete 9-bit fields per 32-bit lane (bits 0-26).
		const __m128i first = _mm_loadu_si128((__m128i *)in);
		_mm_storeu_si128(out, _mm_and_si128(first, mask_9));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(first, 9), mask_9));
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi32(first, 18), mask_9));

		// The second word is loaded only after the first three stores, as in
		// the unrolled original, so the order of memory accesses is unchanged.
		const __m128i second = _mm_loadu_si128((__m128i *)in + 1);

		// Fourth field straddles the two words: bits 27-31 of the first word
		// supply its low 5 bits, bits 0-3 of the second word its high 4 bits.
		const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 5), _mm_srli_epi32(first, 27));
		_mm_storeu_si128(out + 3, _mm_and_si128(straddle, mask_9));

		// Remaining three fields sit at bit offsets 4, 13 and 22 of the
		// second word (bit 31 is not used).
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi32(second, 4), mask_9));
		_mm_storeu_si128(out + 5, _mm_and_si128(_mm_srli_epi32(second, 13), mask_9));
		_mm_storeu_si128(out + 6, _mm_and_si128(_mm_srli_epi32(second, 22), mask_9));

		in += 32;		// two 16-byte loads consumed
		to += 28;		// seven stores of four 32-bit integers produced
		}
	break;
	}
// Selector 0x93: unpack 13 groups of 9-bit values (mask_9 is 0x1ff in every 32-bit lane).
// Each group reads two unaligned 16-byte words from `in` and writes seven unaligned 16-byte
// vectors (four uint32_t lanes each) to `to`, i.e. 28 integers per 32 input bytes.
// Totals for this case: 13 * 32 = 416 bytes consumed, 13 * 28 = 364 integers produced.
// No bounds checks are performed here; both buffers must be large enough for the full run.
// NOTE(review): the neighbouring selectors visible in this file decode one group fewer as the
// selector value increases by one; presumably the low nibble encodes the run length and the
// high nibble the bit width -- confirm against the encoder.
case 0x93:
{
{
// First word: per 32-bit lane, bits 0-8, 9-17 and 18-26 hold values 0, 1 and 2.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// Value 3 straddles the two words: its low 5 bits are bits 27-31 of the first word and its
// high 4 bits are bits 0-3 of the second word (shifted up by 5 before the OR and mask).
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// Second word: bits 4-12, 13-21 and 22-30 hold values 4, 5 and 6; bit 31 of each lane is not read.
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// The remaining groups repeat the same extraction with `in` advanced by 2 words and `to` by 7 vectors each time.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// `in` is a byte pointer: skip the 26 sixteen-byte words read above.
in += 416;
// `to` is a uint32_t pointer: skip the 91 four-lane vectors written above.
to += 364;
break;
}
// Selector 0x94: unpack 12 groups of 9-bit values (mask_9 is 0x1ff in every 32-bit lane).
// Each group reads two unaligned 16-byte words from `in` and writes seven unaligned 16-byte
// vectors (four uint32_t lanes each) to `to`, i.e. 28 integers per 32 input bytes.
// Totals for this case: 12 * 32 = 384 bytes consumed, 12 * 28 = 336 integers produced.
// No bounds checks are performed here; both buffers must be large enough for the full run.
// NOTE(review): presumably the low nibble of the selector encodes the run length and the
// high nibble the bit width -- confirm against the encoder.
case 0x94:
{
{
// First word: per 32-bit lane, bits 0-8, 9-17 and 18-26 hold values 0, 1 and 2.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// Value 3 straddles the two words: its low 5 bits are bits 27-31 of the first word and its
// high 4 bits are bits 0-3 of the second word (shifted up by 5 before the OR and mask).
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// Second word: bits 4-12, 13-21 and 22-30 hold values 4, 5 and 6; bit 31 of each lane is not read.
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// The remaining groups repeat the same extraction with `in` advanced by 2 words and `to` by 7 vectors each time.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// `in` is a byte pointer: skip the 24 sixteen-byte words read above.
in += 384;
// `to` is a uint32_t pointer: skip the 84 four-lane vectors written above.
to += 336;
break;
}
// Selector 0x95: unpack 11 groups of 9-bit values (mask_9 is 0x1ff in every 32-bit lane).
// Each group reads two unaligned 16-byte words from `in` and writes seven unaligned 16-byte
// vectors (four uint32_t lanes each) to `to`, i.e. 28 integers per 32 input bytes.
// Totals for this case: 11 * 32 = 352 bytes consumed, 11 * 28 = 308 integers produced.
// No bounds checks are performed here; both buffers must be large enough for the full run.
// NOTE(review): presumably the low nibble of the selector encodes the run length and the
// high nibble the bit width -- confirm against the encoder.
case 0x95:
{
{
// First word: per 32-bit lane, bits 0-8, 9-17 and 18-26 hold values 0, 1 and 2.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// Value 3 straddles the two words: its low 5 bits are bits 27-31 of the first word and its
// high 4 bits are bits 0-3 of the second word (shifted up by 5 before the OR and mask).
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// Second word: bits 4-12, 13-21 and 22-30 hold values 4, 5 and 6; bit 31 of each lane is not read.
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// The remaining groups repeat the same extraction with `in` advanced by 2 words and `to` by 7 vectors each time.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// `in` is a byte pointer: skip the 22 sixteen-byte words read above.
in += 352;
// `to` is a uint32_t pointer: skip the 77 four-lane vectors written above.
to += 308;
break;
}
case 0x96:
{
    /*
        Key byte 0x96: unpack ten groups of 9-bit fields.
        Each group consumes two consecutive 128-bit words from `in` and produces seven
        128-bit words at `to`.  Every 32-bit lane pair holds seven 9-bit values; the
        fourth one straddles the two input words.
    */
    for (int group = 0; group < 10; ++group)
    {
        __m128i *const dst = (__m128i *)to + 7 * group;

        /* First input word: three whole fields live in bits 0..26 of each lane. */
        const __m128i lo = _mm_loadu_si128((__m128i *)in + 2 * group);
        _mm_storeu_si128(dst, _mm_and_si128(lo, mask_9));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(lo, 9), mask_9));
        _mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi32(lo, 18), mask_9));

        /* Second input word is read only after the first three stores have been issued. */
        const __m128i hi = _mm_loadu_si128((__m128i *)in + 2 * group + 1);

        /* Straddling field: top 5 bits of `lo` joined with the low 4 bits of `hi`. */
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(hi, 5), _mm_srli_epi32(lo, 27)), mask_9));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(hi, 4), mask_9));
        _mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi32(hi, 13), mask_9));
        _mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi32(hi, 22), mask_9));
    }
    in += 320;      /* 10 groups * 2 words * 16 bytes */
    to += 280;      /* 10 groups * 7 words * 4 integers */
    break;
}
case 0x97:
{
    /*
        Key byte 0x97: unpack nine groups of 9-bit fields.
        Each group consumes two consecutive 128-bit words from `in` and produces seven
        128-bit words at `to`.  Every 32-bit lane pair holds seven 9-bit values; the
        fourth one straddles the two input words.
    */
    for (int group = 0; group < 9; ++group)
    {
        __m128i *const dst = (__m128i *)to + 7 * group;

        /* First input word: three whole fields live in bits 0..26 of each lane. */
        const __m128i lo = _mm_loadu_si128((__m128i *)in + 2 * group);
        _mm_storeu_si128(dst, _mm_and_si128(lo, mask_9));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(lo, 9), mask_9));
        _mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi32(lo, 18), mask_9));

        /* Second input word is read only after the first three stores have been issued. */
        const __m128i hi = _mm_loadu_si128((__m128i *)in + 2 * group + 1);

        /* Straddling field: top 5 bits of `lo` joined with the low 4 bits of `hi`. */
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(hi, 5), _mm_srli_epi32(lo, 27)), mask_9));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(hi, 4), mask_9));
        _mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi32(hi, 13), mask_9));
        _mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi32(hi, 22), mask_9));
    }
    in += 288;      /* 9 groups * 2 words * 16 bytes */
    to += 252;      /* 9 groups * 7 words * 4 integers */
    break;
}
case 0x98:
{
    /*
        Key byte 0x98: unpack eight groups of 9-bit fields.
        Each group consumes two consecutive 128-bit words from `in` and produces seven
        128-bit words at `to`.  Every 32-bit lane pair holds seven 9-bit values; the
        fourth one straddles the two input words.
    */
    for (int group = 0; group < 8; ++group)
    {
        __m128i *const dst = (__m128i *)to + 7 * group;

        /* First input word: three whole fields live in bits 0..26 of each lane. */
        const __m128i lo = _mm_loadu_si128((__m128i *)in + 2 * group);
        _mm_storeu_si128(dst, _mm_and_si128(lo, mask_9));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(lo, 9), mask_9));
        _mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi32(lo, 18), mask_9));

        /* Second input word is read only after the first three stores have been issued. */
        const __m128i hi = _mm_loadu_si128((__m128i *)in + 2 * group + 1);

        /* Straddling field: top 5 bits of `lo` joined with the low 4 bits of `hi`. */
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(hi, 5), _mm_srli_epi32(lo, 27)), mask_9));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(hi, 4), mask_9));
        _mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi32(hi, 13), mask_9));
        _mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi32(hi, 22), mask_9));
    }
    in += 256;      /* 8 groups * 2 words * 16 bytes */
    to += 224;      /* 8 groups * 7 words * 4 integers */
    break;
}
case 0x99:
{
    /*
        Key byte 0x99: unpack seven groups of 9-bit fields.
        Each group consumes two consecutive 128-bit words from `in` and produces seven
        128-bit words at `to`.  Every 32-bit lane pair holds seven 9-bit values; the
        fourth one straddles the two input words.
    */
    for (int group = 0; group < 7; ++group)
    {
        __m128i *const dst = (__m128i *)to + 7 * group;

        /* First input word: three whole fields live in bits 0..26 of each lane. */
        const __m128i lo = _mm_loadu_si128((__m128i *)in + 2 * group);
        _mm_storeu_si128(dst, _mm_and_si128(lo, mask_9));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(lo, 9), mask_9));
        _mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi32(lo, 18), mask_9));

        /* Second input word is read only after the first three stores have been issued. */
        const __m128i hi = _mm_loadu_si128((__m128i *)in + 2 * group + 1);

        /* Straddling field: top 5 bits of `lo` joined with the low 4 bits of `hi`. */
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(hi, 5), _mm_srli_epi32(lo, 27)), mask_9));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(hi, 4), mask_9));
        _mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi32(hi, 13), mask_9));
        _mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi32(hi, 22), mask_9));
    }
    in += 224;      /* 7 groups * 2 words * 16 bytes */
    to += 196;      /* 7 groups * 7 words * 4 integers */
    break;
}
case 0x9a:
{
    /*
        Key byte 0x9a: unpack six groups of 9-bit fields.
        Each group consumes two consecutive 128-bit words from `in` and produces seven
        128-bit words at `to`.  Every 32-bit lane pair holds seven 9-bit values; the
        fourth one straddles the two input words.
    */
    for (int group = 0; group < 6; ++group)
    {
        __m128i *const dst = (__m128i *)to + 7 * group;

        /* First input word: three whole fields live in bits 0..26 of each lane. */
        const __m128i lo = _mm_loadu_si128((__m128i *)in + 2 * group);
        _mm_storeu_si128(dst, _mm_and_si128(lo, mask_9));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(lo, 9), mask_9));
        _mm_storeu_si128(dst + 2, _mm_and_si128(_mm_srli_epi32(lo, 18), mask_9));

        /* Second input word is read only after the first three stores have been issued. */
        const __m128i hi = _mm_loadu_si128((__m128i *)in + 2 * group + 1);

        /* Straddling field: top 5 bits of `lo` joined with the low 4 bits of `hi`. */
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(hi, 5), _mm_srli_epi32(lo, 27)), mask_9));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(hi, 4), mask_9));
        _mm_storeu_si128(dst + 5, _mm_and_si128(_mm_srli_epi32(hi, 13), mask_9));
        _mm_storeu_si128(dst + 6, _mm_and_si128(_mm_srli_epi32(hi, 22), mask_9));
    }
    in += 192;      /* 6 groups * 2 words * 16 bytes */
    to += 168;      /* 6 groups * 7 words * 4 integers */
    break;
}
case 0x9b:
{
// Selector 0x9b: unpack 9-bit integers from five 32-byte groups.
// Each group loads two 128-bit words and stores seven 128-bit vectors
// (7 x 4 = 28 uint32_t values), every lane masked with mask_9 (0x1ff).
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
// bits 0-8, 9-17 and 18-26 of each 32-bit lane of the first word
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// this value straddles both words: bits 27-31 of the first word are its low 5 bits, bits 0-3 of the second word its high 4 bits
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// bits 4-12, 13-21 and 22-30 of each 32-bit lane of the second word
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 2: same bit layout, input words 2-3, output vectors 7-13
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 3: input words 4-5, output vectors 14-20
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 4: input words 6-7, output vectors 21-27
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 5: input words 8-9, output vectors 28-34
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
in += 160; // 5 groups x 32 bytes consumed (in is a byte pointer)
to += 140; // 5 groups x 28 integers produced
break;
}
case 0x9c:
{
// Selector 0x9c: unpack 9-bit integers from four 32-byte groups.
// Each group loads two 128-bit words and stores seven 128-bit vectors
// (7 x 4 = 28 uint32_t values), every lane masked with mask_9 (0x1ff).
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
// bits 0-8, 9-17 and 18-26 of each 32-bit lane of the first word
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// this value straddles both words: bits 27-31 of the first word are its low 5 bits, bits 0-3 of the second word its high 4 bits
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// bits 4-12, 13-21 and 22-30 of each 32-bit lane of the second word
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 2: same bit layout, input words 2-3, output vectors 7-13
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 3: input words 4-5, output vectors 14-20
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 4: input words 6-7, output vectors 21-27
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
in += 128; // 4 groups x 32 bytes consumed (in is a byte pointer)
to += 112; // 4 groups x 28 integers produced
break;
}
case 0x9d:
{
// Selector 0x9d: unpack 9-bit integers from three 32-byte groups.
// Each group loads two 128-bit words and stores seven 128-bit vectors
// (7 x 4 = 28 uint32_t values), every lane masked with mask_9 (0x1ff).
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
// bits 0-8, 9-17 and 18-26 of each 32-bit lane of the first word
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// this value straddles both words: bits 27-31 of the first word are its low 5 bits, bits 0-3 of the second word its high 4 bits
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// bits 4-12, 13-21 and 22-30 of each 32-bit lane of the second word
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 2: same bit layout, input words 2-3, output vectors 7-13
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 3: input words 4-5, output vectors 14-20
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
in += 96; // 3 groups x 32 bytes consumed (in is a byte pointer)
to += 84; // 3 groups x 28 integers produced
break;
}
case 0x9e:
{
// Selector 0x9e: unpack 9-bit integers from two 32-byte groups.
// Each group loads two 128-bit words and stores seven 128-bit vectors
// (7 x 4 = 28 uint32_t values), every lane masked with mask_9 (0x1ff).
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
// bits 0-8, 9-17 and 18-26 of each 32-bit lane of the first word
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// this value straddles both words: bits 27-31 of the first word are its low 5 bits, bits 0-3 of the second word its high 4 bits
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// bits 4-12, 13-21 and 22-30 of each 32-bit lane of the second word
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
// group 2: same bit layout, input words 2-3, output vectors 7-13
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
in += 64; // 2 groups x 32 bytes consumed (in is a byte pointer)
to += 56; // 2 groups x 28 integers produced
break;
}
case 0x9f:
{
// Selector 0x9f: unpack 9-bit integers from a single 32-byte group.
// The group loads two 128-bit words and stores seven 128-bit vectors
// (7 x 4 = 28 uint32_t values), every lane masked with mask_9 (0x1ff).
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
// bits 0-8, 9-17 and 18-26 of each 32-bit lane of the first word
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
// this value straddles both words: bits 27-31 of the first word are its low 5 bits, bits 0-3 of the second word its high 4 bits
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
// bits 4-12, 13-21 and 22-30 of each 32-bit lane of the second word
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
}
in += 32; // 1 group x 32 bytes consumed (in is a byte pointer)
to += 28; // 1 group x 28 integers produced
break;
}
case 0xa0:
{
// Selector 0xa0: unpack 10-bit integers from sixteen 128-bit words.
// Each word yields three 128-bit vectors (3 x 4 = 12 uint32_t values) taken
// from bits 0-9, 10-19 and 20-29 of every 32-bit lane; bits 30-31 are not read.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
// The shifts are 64-bit wide, but any bits carried from the upper into the lower
// 32-bit lane land above bit 9 and are cleared by mask_10 (0x3ff per lane).
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
// The remaining words repeat the same pattern: input word k fills output vectors 3k .. 3k+2.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
in += 256; // 16 words x 16 bytes consumed (in is a byte pointer)
to += 192; // 16 words x 12 integers produced
break;
}
case 0xa1:
{
// Selector 0xa1: unpack 10-bit integers from fifteen 128-bit words.
// Each word yields three 128-bit vectors (3 x 4 = 12 uint32_t values) taken
// from bits 0-9, 10-19 and 20-29 of every 32-bit lane; bits 30-31 are not read.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
// The shifts are 64-bit wide, but any bits carried from the upper into the lower
// 32-bit lane land above bit 9 and are cleared by mask_10 (0x3ff per lane).
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
// The remaining words repeat the same pattern: input word k fills output vectors 3k .. 3k+2.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
in += 240; // 15 words x 16 bytes consumed (in is a byte pointer)
to += 180; // 15 words x 12 integers produced
break;
}
case 0xa2:
{
// Selector 0xa2: unpack 10-bit integers from fourteen 128-bit words.
// Each word yields three 128-bit vectors (3 x 4 = 12 uint32_t values) taken
// from bits 0-9, 10-19 and 20-29 of every 32-bit lane; bits 30-31 are not read.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
// The shifts are 64-bit wide, but any bits carried from the upper into the lower
// 32-bit lane land above bit 9 and are cleared by mask_10 (0x3ff per lane).
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
// The remaining words repeat the same pattern: input word k fills output vectors 3k .. 3k+2.
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
in += 224; // 14 words x 16 bytes consumed (in is a byte pointer)
to += 168; // 14 words x 12 integers produced
break;
}
case 0xa3:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
}
in += 208;
to += 156;
break;
}
case 0xa4:
    {
    // Key 0xa4: a run of 12 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 144.
    const int word_count = 12;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (192)
    to += word_count * 12;      // integers produced (144)
    break;
    }
case 0xa5:
    {
    // Key 0xa5: a run of 11 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 132.
    const int word_count = 11;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (176)
    to += word_count * 12;      // integers produced (132)
    break;
    }
case 0xa6:
    {
    // Key 0xa6: a run of 10 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 120.
    const int word_count = 10;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (160)
    to += word_count * 12;      // integers produced (120)
    break;
    }
case 0xa7:
    {
    // Key 0xa7: a run of 9 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 108.
    const int word_count = 9;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (144)
    to += word_count * 12;      // integers produced (108)
    break;
    }
case 0xa8:
    {
    // Key 0xa8: a run of 8 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 96.
    const int word_count = 8;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (128)
    to += word_count * 12;      // integers produced (96)
    break;
    }
case 0xa9:
    {
    // Key 0xa9: a run of 7 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 84.
    const int word_count = 7;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (112)
    to += word_count * 12;      // integers produced (84)
    break;
    }
case 0xaa:
    {
    // Key 0xaa: a run of 6 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 72.
    const int word_count = 6;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (96)
    to += word_count * 12;      // integers produced (72)
    break;
    }
case 0xab:
    {
    // Key 0xab: a run of 5 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 60.
    const int word_count = 5;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (80)
    to += word_count * 12;      // integers produced (60)
    break;
    }
case 0xac:
    {
    // Key 0xac: a run of 4 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 48.
    const int word_count = 4;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (64)
    to += word_count * 12;      // integers produced (48)
    break;
    }
case 0xad:
    {
    // Key 0xad: a run of 3 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 36.
    const int word_count = 3;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (48)
    to += word_count * 12;      // integers produced (36)
    break;
    }
case 0xae:
    {
    // Key 0xae: a run of 2 sixteen-byte words holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so one
    // word expands to 12 uint32_t outputs and the whole run to 24.
    const int word_count = 2;
    const __m128i *packed = (const __m128i *)in;
    __m128i *unpacked = (__m128i *)to;

    // Constant trip count, so the compiler may unroll this completely.
    for (int word = 0; word < word_count; word++)
        {
        const __m128i lanes = _mm_loadu_si128(packed + word);
        __m128i *out = unpacked + 3 * word;

        // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
        // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
        _mm_storeu_si128(out, _mm_and_si128(lanes, mask_10));
        _mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10));
        _mm_storeu_si128(out + 2, _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10));
        }

    in += word_count * 16;      // bytes consumed (32)
    to += word_count * 12;      // integers produced (24)
    break;
    }
case 0xaf:
    {
    // Key 0xaf: a single sixteen-byte word holding packed 10-bit values.
    // Each 32-bit lane carries three fields (bits 0-9, 10-19, 20-29), so the
    // word expands to 12 uint32_t outputs.
    const __m128i lanes = _mm_loadu_si128((const __m128i *)in);
    __m128i *unpacked = (__m128i *)to;

    // The shifts are 64-bit wide, but mask_10 keeps only the low 10 bits of
    // each 32-bit lane, so bits crossing in from the neighbouring lane are dropped.
    const __m128i field_0 = _mm_and_si128(lanes, mask_10);
    _mm_storeu_si128(unpacked, field_0);
    const __m128i field_1 = _mm_and_si128(_mm_srli_epi64(lanes, 10), mask_10);
    _mm_storeu_si128(unpacked + 1, field_1);
    const __m128i field_2 = _mm_and_si128(_mm_srli_epi64(lanes, 20), mask_10);
    _mm_storeu_si128(unpacked + 2, field_2);

    in += 16;       // bytes consumed
    to += 12;       // integers produced
    break;
    }
case 0xb0:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26);
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27);
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28);
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29);
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30);
_mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31);
_mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
in += 512;
to += 320;
break;
}
case 0xb1:
{
	// Key byte 0xb1: unpack 15 groups of integers masked with mask_12.
	// Every group consumes two 128-bit vectors from `in` and produces five
	// 128-bit vectors in `to`, i.e. 15 * 32 = 480 input bytes and
	// 15 * 5 * 4 = 300 output integers in total.
	// The trip count is a compile-time constant, so the compiler remains free
	// to unroll the loop completely.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int group = 0; group < 15; group++)
	{
		__m128i *out = unpacked + 5 * group;

		// First vector: bits 0..11 and 12..23 of every 32-bit lane.
		const __m128i low = _mm_loadu_si128(packed + 2 * group);
		_mm_storeu_si128(out + 0, _mm_and_si128(low, mask_12));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(low, 12), mask_12));

		// Second vector is loaded only after the first two stores, keeping the
		// original load/store ordering.
		const __m128i high = _mm_loadu_si128(packed + 2 * group + 1);

		// Third output straddles both vectors: the top 8 bits of `low` joined
		// with the bottom 4 bits of `high` (shifted up to bits 8..11).
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(high, 8), _mm_srli_epi32(low, 24)), mask_12));

		// Remaining outputs come from bits 8..19 and 20..31 of `high`;
		// bits 4..7 of `high` are not read by any output.
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(high, 8), mask_12));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi32(high, 20), mask_12));
	}

	in += 480;
	to += 300;
	break;
}
case 0xb2:
{
	// Key byte 0xb2: unpack 14 groups of integers masked with mask_12.
	// Every group consumes two 128-bit vectors from `in` and produces five
	// 128-bit vectors in `to`, i.e. 14 * 32 = 448 input bytes and
	// 14 * 5 * 4 = 280 output integers in total.
	// The trip count is a compile-time constant, so the compiler remains free
	// to unroll the loop completely.
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int group = 0; group < 14; group++)
	{
		__m128i *out = unpacked + 5 * group;

		// First vector: bits 0..11 and 12..23 of every 32-bit lane.
		const __m128i low = _mm_loadu_si128(packed + 2 * group);
		_mm_storeu_si128(out + 0, _mm_and_si128(low, mask_12));
		_mm_storeu_si128(out + 1, _mm_and_si128(_mm_srli_epi32(low, 12), mask_12));

		// Second vector is loaded only after the first two stores, keeping the
		// original load/store ordering.
		const __m128i high = _mm_loadu_si128(packed + 2 * group + 1);

		// Third output straddles both vectors: the top 8 bits of `low` joined
		// with the bottom 4 bits of `high` (shifted up to bits 8..11).
		_mm_storeu_si128(out + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(high, 8), _mm_srli_epi32(low, 24)), mask_12));

		// Remaining outputs come from bits 8..19 and 20..31 of `high`;
		// bits 4..7 of `high` are not read by any output.
		_mm_storeu_si128(out + 3, _mm_and_si128(_mm_srli_epi32(high, 8), mask_12));
		_mm_storeu_si128(out + 4, _mm_and_si128(_mm_srli_epi32(high, 20), mask_12));
	}

	in += 448;
	to += 280;
	break;
}
case 0xb3:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12));
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
}
in += 416;
to += 260;
break;
}
case 0xb4:
{
	/*
		Key 0xb4: expand 12 back-to-back groups of 12-bit integers.
		One group is two 128-bit words (32 bytes) of input and produces five
		128-bit vectors of output, i.e. 20 32-bit integers each masked to 12 bits.
		Total: 12 * 32 = 384 bytes consumed, 12 * 20 = 240 integers written.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int group = 0; group < 12; group++, packed += 2, unpacked += 5)
	{
		/* bits 0..11 and 12..23 of every lane of the first word */
		const __m128i first_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(first_word, mask_12));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(first_word, 12), mask_12));

		/* the third value straddles the words: top 8 bits of the first, low 4 bits of the second */
		const __m128i second_word = _mm_loadu_si128(packed + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 8), _mm_srli_epi32(first_word, 24));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(straddling, mask_12));

		/* bits 8..19 and 20..31 of the second word (its bits 4..7 are never read) */
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(second_word, 8), mask_12));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi32(second_word, 20), mask_12));
	}

	in += 384;
	to += 240;
	break;
}
case 0xb5:
{
	/*
		Key 0xb5: expand 11 back-to-back groups of 12-bit integers.
		One group is two 128-bit words (32 bytes) of input and produces five
		128-bit vectors of output, i.e. 20 32-bit integers each masked to 12 bits.
		Total: 11 * 32 = 352 bytes consumed, 11 * 20 = 220 integers written.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int group = 0; group < 11; group++, packed += 2, unpacked += 5)
	{
		/* bits 0..11 and 12..23 of every lane of the first word */
		const __m128i first_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(first_word, mask_12));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(first_word, 12), mask_12));

		/* the third value straddles the words: top 8 bits of the first, low 4 bits of the second */
		const __m128i second_word = _mm_loadu_si128(packed + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 8), _mm_srli_epi32(first_word, 24));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(straddling, mask_12));

		/* bits 8..19 and 20..31 of the second word (its bits 4..7 are never read) */
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(second_word, 8), mask_12));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi32(second_word, 20), mask_12));
	}

	in += 352;
	to += 220;
	break;
}
case 0xb6:
{
	/*
		Key 0xb6: expand 10 back-to-back groups of 12-bit integers.
		One group is two 128-bit words (32 bytes) of input and produces five
		128-bit vectors of output, i.e. 20 32-bit integers each masked to 12 bits.
		Total: 10 * 32 = 320 bytes consumed, 10 * 20 = 200 integers written.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int group = 0; group < 10; group++, packed += 2, unpacked += 5)
	{
		/* bits 0..11 and 12..23 of every lane of the first word */
		const __m128i first_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(first_word, mask_12));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(first_word, 12), mask_12));

		/* the third value straddles the words: top 8 bits of the first, low 4 bits of the second */
		const __m128i second_word = _mm_loadu_si128(packed + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 8), _mm_srli_epi32(first_word, 24));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(straddling, mask_12));

		/* bits 8..19 and 20..31 of the second word (its bits 4..7 are never read) */
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(second_word, 8), mask_12));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi32(second_word, 20), mask_12));
	}

	in += 320;
	to += 200;
	break;
}
case 0xb7:
{
	/*
		Key 0xb7: expand 9 back-to-back groups of 12-bit integers.
		One group is two 128-bit words (32 bytes) of input and produces five
		128-bit vectors of output, i.e. 20 32-bit integers each masked to 12 bits.
		Total: 9 * 32 = 288 bytes consumed, 9 * 20 = 180 integers written.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int group = 0; group < 9; group++, packed += 2, unpacked += 5)
	{
		/* bits 0..11 and 12..23 of every lane of the first word */
		const __m128i first_word = _mm_loadu_si128(packed);
		_mm_storeu_si128(unpacked, _mm_and_si128(first_word, mask_12));
		_mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(first_word, 12), mask_12));

		/* the third value straddles the words: top 8 bits of the first, low 4 bits of the second */
		const __m128i second_word = _mm_loadu_si128(packed + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(second_word, 8), _mm_srli_epi32(first_word, 24));
		_mm_storeu_si128(unpacked + 2, _mm_and_si128(straddling, mask_12));

		/* bits 8..19 and 20..31 of the second word (its bits 4..7 are never read) */
		_mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(second_word, 8), mask_12));
		_mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi32(second_word, 20), mask_12));
	}

	in += 288;
	to += 180;
	break;
}
case 0xb8:
{
    /*
        Key 0xb8: unpack 8 pairs of 128-bit words, 12 bits per value.
        Within every 32-bit lane a pair yields five values:
          - first word, bits 0-11 and bits 12-23
          - a value straddling the pair (bits 24-31 of the first word form its
            low 8 bits, bits 0-3 of the second word form its high 4 bits)
          - second word, bits 8-19 and bits 20-31
        Each value is written as one 128-bit store of four 32-bit integers.
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 8; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 256;    // 8 pairs * 2 words * 16 bytes
    to += 160;    // 8 pairs * 5 stores * 4 integers
    break;
}
case 0xb9:
{
    /*
        Key 0xb9: unpack 7 pairs of 128-bit words, 12 bits per value.
        Per 32-bit lane each pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 7; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 224;    // 7 pairs * 2 words * 16 bytes
    to += 140;    // 7 pairs * 5 stores * 4 integers
    break;
}
case 0xba:
{
    /*
        Key 0xba: unpack 6 pairs of 128-bit words, 12 bits per value.
        Per 32-bit lane each pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 6; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 192;    // 6 pairs * 2 words * 16 bytes
    to += 120;    // 6 pairs * 5 stores * 4 integers
    break;
}
case 0xbb:
{
    /*
        Key 0xbb: unpack 5 pairs of 128-bit words, 12 bits per value.
        Per 32-bit lane each pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 5; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 160;    // 5 pairs * 2 words * 16 bytes
    to += 100;    // 5 pairs * 5 stores * 4 integers
    break;
}
case 0xbc:
{
    /*
        Key 0xbc: unpack 4 pairs of 128-bit words, 12 bits per value.
        Per 32-bit lane each pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 4; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 128;    // 4 pairs * 2 words * 16 bytes
    to += 80;     // 4 pairs * 5 stores * 4 integers
    break;
}
case 0xbd:
{
    /*
        Key 0xbd: unpack 3 pairs of 128-bit words, 12 bits per value.
        Per 32-bit lane each pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 3; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 96;     // 3 pairs * 2 words * 16 bytes
    to += 60;     // 3 pairs * 5 stores * 4 integers
    break;
}
case 0xbe:
{
    /*
        Key 0xbe: unpack 2 pairs of 128-bit words, 12 bits per value.
        Per 32-bit lane each pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int group = 0; group < 2; group++)
    {
        const __m128i *src = packed + 2 * group;
        __m128i *dst = unpacked + 5 * group;

        const __m128i first = _mm_loadu_si128(src);
        _mm_storeu_si128(dst, _mm_and_si128(first, mask_12));
        _mm_storeu_si128(dst + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

        // Second word of the pair; loaded after the first two stores.
        const __m128i second = _mm_loadu_si128(src + 1);
        const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
        _mm_storeu_si128(dst + 2, _mm_and_si128(straddle, mask_12));
        _mm_storeu_si128(dst + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
        _mm_storeu_si128(dst + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));
    }

    in += 64;     // 2 pairs * 2 words * 16 bytes
    to += 40;     // 2 pairs * 5 stores * 4 integers
    break;
}
case 0xbf:
{
    /*
        Key 0xbf: unpack a single pair of 128-bit words, 12 bits per value.
        Per 32-bit lane the pair produces five values: two from the first word
        (bits 0-11, 12-23), one straddling both words (bits 24-31 of the first
        plus bits 0-3 of the second), and two from the second word (bits 8-19,
        20-31).
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    const __m128i first = _mm_loadu_si128(packed);
    _mm_storeu_si128(unpacked, _mm_and_si128(first, mask_12));
    _mm_storeu_si128(unpacked + 1, _mm_and_si128(_mm_srli_epi32(first, 12), mask_12));

    // Second word of the pair; loaded after the first two stores.
    const __m128i second = _mm_loadu_si128(packed + 1);
    const __m128i straddle = _mm_or_si128(_mm_slli_epi32(second, 8), _mm_srli_epi32(first, 24));
    _mm_storeu_si128(unpacked + 2, _mm_and_si128(straddle, mask_12));
    _mm_storeu_si128(unpacked + 3, _mm_and_si128(_mm_srli_epi32(second, 8), mask_12));
    _mm_storeu_si128(unpacked + 4, _mm_and_si128(_mm_srli_epi32(second, 20), mask_12));

    in += 32;     // 2 words * 16 bytes
    to += 20;     // 5 stores * 4 integers
    break;
}
case 0xc0:
{
    /*
        Key 0xc0: widen 16 128-bit words of 16-bit values to 32-bit integers.
        Each word holds eight 16-bit values; the low four are zero-extended
        directly, then the upper 64 bits are moved into the low half and
        zero-extended the same way, giving two 128-bit stores per word.
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int word = 0; word < 16; word++)
    {
        const __m128i shorts = _mm_loadu_si128(packed + word);
        _mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(shorts));

        // Bring the upper four 16-bit values down to the low 64 bits.
        const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(shorts), _mm_castsi128_ps(shorts)));
        _mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
    }

    in += 256;    // 16 words * 16 bytes
    to += 128;    // 16 words * 8 integers
    break;
}
case 0xc1:
{
    /*
        Key 0xc1: widen 15 128-bit words of 16-bit values to 32-bit integers.
        Each word holds eight 16-bit values; the low four are zero-extended
        directly, then the upper 64 bits are moved into the low half and
        zero-extended the same way, giving two 128-bit stores per word.
    */
    const __m128i *packed = reinterpret_cast<const __m128i *>(in);
    __m128i *unpacked = reinterpret_cast<__m128i *>(to);

    for (int word = 0; word < 15; word++)
    {
        const __m128i shorts = _mm_loadu_si128(packed + word);
        _mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(shorts));

        // Bring the upper four 16-bit values down to the low 64 bits.
        const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(shorts), _mm_castsi128_ps(shorts)));
        _mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
    }

    in += 240;    // 15 words * 16 bytes
    to += 120;    // 15 words * 8 integers
    break;
}
case 0xc2:
	{
	/*
		Key 0xc2: fourteen 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 14; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 14 * sizeof(__m128i);		/* 224 bytes consumed */
	to += 14 * 8;					/* 112 integers produced */
	break;
	}
case 0xc3:
	{
	/*
		Key 0xc3: thirteen 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 13; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 13 * sizeof(__m128i);		/* 208 bytes consumed */
	to += 13 * 8;					/* 104 integers produced */
	break;
	}
case 0xc4:
	{
	/*
		Key 0xc4: twelve 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 12; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 12 * sizeof(__m128i);		/* 192 bytes consumed */
	to += 12 * 8;					/* 96 integers produced */
	break;
	}
case 0xc5:
	{
	/*
		Key 0xc5: eleven 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 11; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 11 * sizeof(__m128i);		/* 176 bytes consumed */
	to += 11 * 8;					/* 88 integers produced */
	break;
	}
case 0xc6:
	{
	/*
		Key 0xc6: ten 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 10; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 10 * sizeof(__m128i);		/* 160 bytes consumed */
	to += 10 * 8;					/* 80 integers produced */
	break;
	}
case 0xc7:
	{
	/*
		Key 0xc7: nine 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 9; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 9 * sizeof(__m128i);		/* 144 bytes consumed */
	to += 9 * 8;					/* 72 integers produced */
	break;
	}
case 0xc8:
	{
	/*
		Key 0xc8: eight 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 8; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 8 * sizeof(__m128i);		/* 128 bytes consumed */
	to += 8 * 8;					/* 64 integers produced */
	break;
	}
case 0xc9:
	{
	/*
		Key 0xc9: seven 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 7; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 7 * sizeof(__m128i);		/* 112 bytes consumed */
	to += 7 * 8;					/* 56 integers produced */
	break;
	}
case 0xca:
	{
	/*
		Key 0xca: six 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 6; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 6 * sizeof(__m128i);		/* 96 bytes consumed */
	to += 6 * 8;					/* 48 integers produced */
	break;
	}
case 0xcb:
	{
	/*
		Key 0xcb: five 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 5; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 5 * sizeof(__m128i);		/* 80 bytes consumed */
	to += 5 * 8;					/* 40 integers produced */
	break;
	}
case 0xcc:
	{
	/*
		Key 0xcc: four 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 4; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 4 * sizeof(__m128i);		/* 64 bytes consumed */
	to += 4 * 8;					/* 32 integers produced */
	break;
	}
case 0xcd:
	{
	/*
		Key 0xcd: three 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 3; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 3 * sizeof(__m128i);		/* 48 bytes consumed */
	to += 3 * 8;					/* 24 integers produced */
	break;
	}
case 0xce:
	{
	/*
		Key 0xce: two 128-bit words, each holding eight 16-bit integers.
		Every word is zero-extended into two output vectors of four 32-bit integers.
		The trip count is a compile-time constant, so the compiler remains free to unroll.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int word = 0; word < 2; word++)
		{
		const __m128i lanes = _mm_loadu_si128(packed + word);
		/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
		const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

		_mm_storeu_si128(unpacked + 2 * word, _mm_cvtepu16_epi32(lanes));
		_mm_storeu_si128(unpacked + 2 * word + 1, _mm_cvtepu16_epi32(upper));
		}

	in += 2 * sizeof(__m128i);		/* 32 bytes consumed */
	to += 2 * 8;					/* 16 integers produced */
	break;
	}
case 0xcf:
	{
	/*
		Key 0xcf: a single 128-bit word holding eight 16-bit integers,
		zero-extended into two output vectors of four 32-bit integers.
	*/
	__m128i *unpacked = (__m128i *)to;
	const __m128i lanes = _mm_loadu_si128((const __m128i *)in);
	/* bring the upper 64 bits (lanes 4..7) down into the lower half so they can be widened too */
	const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(lanes), _mm_castsi128_ps(lanes)));

	_mm_storeu_si128(unpacked, _mm_cvtepu16_epi32(lanes));
	_mm_storeu_si128(unpacked + 1, _mm_cvtepu16_epi32(upper));

	in += sizeof(__m128i);		/* 16 bytes consumed */
	to += 8;					/* 8 integers produced */
	break;
	}
case 0xd0:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20);
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21);
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22);
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23);
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24);
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25);
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26);
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27);
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28);
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29);
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30);
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31);
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 512;
to += 192;
break;
}
case 0xd1:
{
	/*
		Key byte 0xd1: unpack 15 runs of 21-bit integers.

		One run reads two consecutive 128-bit vectors from `in` and writes three
		consecutive 128-bit vectors to `to`.  Within each 32-bit lane:
			first  value = low 21 bits of the first word
			second value = top 11 bits of the first word | low 10 bits of the second word << 11
			third  value = top 21 bits of the second word
		(bit 10 of the second word is not read by this decoder).

		The loads and stores are issued in the same order as the fully unrolled
		form; the trip count is a compile-time constant, so the compiler may unroll it.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int run = 0; run < 15; run++)
	{
		const __m128i *pair = packed + 2 * run;
		__m128i *triple = unpacked + 3 * run;

		const __m128i low_words = _mm_loadu_si128(pair);
		_mm_storeu_si128(triple, _mm_and_si128(low_words, mask_21));

		const __m128i high_words = _mm_loadu_si128(pair + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_words, 11), _mm_srli_epi32(low_words, 21));
		_mm_storeu_si128(triple + 1, _mm_and_si128(straddling, mask_21));
		_mm_storeu_si128(triple + 2, _mm_and_si128(_mm_srli_epi32(high_words, 11), mask_21));
	}

	in += 480;		// 15 runs * 2 vectors * 16 bytes
	to += 180;		// 15 runs * 3 vectors * 4 integers
	break;
}
case 0xd2:
{
	/*
		Key byte 0xd2: unpack 14 runs of 21-bit integers.

		One run reads two consecutive 128-bit vectors from `in` and writes three
		consecutive 128-bit vectors to `to`.  Within each 32-bit lane:
			first  value = low 21 bits of the first word
			second value = top 11 bits of the first word | low 10 bits of the second word << 11
			third  value = top 21 bits of the second word
		(bit 10 of the second word is not read by this decoder).

		The loads and stores are issued in the same order as the fully unrolled
		form; the trip count is a compile-time constant, so the compiler may unroll it.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int run = 0; run < 14; run++)
	{
		const __m128i *pair = packed + 2 * run;
		__m128i *triple = unpacked + 3 * run;

		const __m128i low_words = _mm_loadu_si128(pair);
		_mm_storeu_si128(triple, _mm_and_si128(low_words, mask_21));

		const __m128i high_words = _mm_loadu_si128(pair + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_words, 11), _mm_srli_epi32(low_words, 21));
		_mm_storeu_si128(triple + 1, _mm_and_si128(straddling, mask_21));
		_mm_storeu_si128(triple + 2, _mm_and_si128(_mm_srli_epi32(high_words, 11), mask_21));
	}

	in += 448;		// 14 runs * 2 vectors * 16 bytes
	to += 168;		// 14 runs * 3 vectors * 4 integers
	break;
}
case 0xd3:
{
	/*
		Key byte 0xd3: unpack 13 runs of 21-bit integers.

		One run reads two consecutive 128-bit vectors from `in` and writes three
		consecutive 128-bit vectors to `to`.  Within each 32-bit lane:
			first  value = low 21 bits of the first word
			second value = top 11 bits of the first word | low 10 bits of the second word << 11
			third  value = top 21 bits of the second word
		(bit 10 of the second word is not read by this decoder).

		The loads and stores are issued in the same order as the fully unrolled
		form; the trip count is a compile-time constant, so the compiler may unroll it.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int run = 0; run < 13; run++)
	{
		const __m128i *pair = packed + 2 * run;
		__m128i *triple = unpacked + 3 * run;

		const __m128i low_words = _mm_loadu_si128(pair);
		_mm_storeu_si128(triple, _mm_and_si128(low_words, mask_21));

		const __m128i high_words = _mm_loadu_si128(pair + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_words, 11), _mm_srli_epi32(low_words, 21));
		_mm_storeu_si128(triple + 1, _mm_and_si128(straddling, mask_21));
		_mm_storeu_si128(triple + 2, _mm_and_si128(_mm_srli_epi32(high_words, 11), mask_21));
	}

	in += 416;		// 13 runs * 2 vectors * 16 bytes
	to += 156;		// 13 runs * 3 vectors * 4 integers
	break;
}
case 0xd4:
{
	/*
		Key byte 0xd4: unpack 12 runs of 21-bit integers.

		One run reads two consecutive 128-bit vectors from `in` and writes three
		consecutive 128-bit vectors to `to`.  Within each 32-bit lane:
			first  value = low 21 bits of the first word
			second value = top 11 bits of the first word | low 10 bits of the second word << 11
			third  value = top 21 bits of the second word
		(bit 10 of the second word is not read by this decoder).

		The loads and stores are issued in the same order as the fully unrolled
		form; the trip count is a compile-time constant, so the compiler may unroll it.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int run = 0; run < 12; run++)
	{
		const __m128i *pair = packed + 2 * run;
		__m128i *triple = unpacked + 3 * run;

		const __m128i low_words = _mm_loadu_si128(pair);
		_mm_storeu_si128(triple, _mm_and_si128(low_words, mask_21));

		const __m128i high_words = _mm_loadu_si128(pair + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_words, 11), _mm_srli_epi32(low_words, 21));
		_mm_storeu_si128(triple + 1, _mm_and_si128(straddling, mask_21));
		_mm_storeu_si128(triple + 2, _mm_and_si128(_mm_srli_epi32(high_words, 11), mask_21));
	}

	in += 384;		// 12 runs * 2 vectors * 16 bytes
	to += 144;		// 12 runs * 3 vectors * 4 integers
	break;
}
case 0xd5:
{
	/*
		Key byte 0xd5: unpack 11 runs of 21-bit integers.

		One run reads two consecutive 128-bit vectors from `in` and writes three
		consecutive 128-bit vectors to `to`.  Within each 32-bit lane:
			first  value = low 21 bits of the first word
			second value = top 11 bits of the first word | low 10 bits of the second word << 11
			third  value = top 21 bits of the second word
		(bit 10 of the second word is not read by this decoder).

		The loads and stores are issued in the same order as the fully unrolled
		form; the trip count is a compile-time constant, so the compiler may unroll it.
	*/
	const __m128i *packed = (const __m128i *)in;
	__m128i *unpacked = (__m128i *)to;

	for (int run = 0; run < 11; run++)
	{
		const __m128i *pair = packed + 2 * run;
		__m128i *triple = unpacked + 3 * run;

		const __m128i low_words = _mm_loadu_si128(pair);
		_mm_storeu_si128(triple, _mm_and_si128(low_words, mask_21));

		const __m128i high_words = _mm_loadu_si128(pair + 1);
		const __m128i straddling = _mm_or_si128(_mm_slli_epi32(high_words, 11), _mm_srli_epi32(low_words, 21));
		_mm_storeu_si128(triple + 1, _mm_and_si128(straddling, mask_21));
		_mm_storeu_si128(triple + 2, _mm_and_si128(_mm_srli_epi32(high_words, 11), mask_21));
	}

	in += 352;		// 11 runs * 2 vectors * 16 bytes
	to += 132;		// 11 runs * 3 vectors * 4 integers
	break;
}
case 0xd6:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18);
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19);
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 320;
to += 120;
break;
}
case 0xd7:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16);
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17);
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 288;
to += 108;
break;
}
case 0xd8:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 256;
to += 96;
break;
}
case 0xd9:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 224;
to += 84;
break;
}
case 0xda:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 192;
to += 72;
break;
}
case 0xdb:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 160;
to += 60;
break;
}
case 0xdc:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 128;
to += 48;
break;
}
case 0xdd:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 96;
to += 36;
break;
}
case 0xde:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 64;
to += 24;
break;
}
case 0xdf:
{
{
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
}
in += 32;
to += 12;
break;
}
case 0xe0:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 10, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 11, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 12, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 13, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 14, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15);
_mm_storeu_si128((__m128i *)to + 15, tmp);
}
in += 256;
to += 64;
break;
}
case 0xe1:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 10, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 11, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 12, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 13, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14);
_mm_storeu_si128((__m128i *)to + 14, tmp);
}
in += 240;
to += 60;
break;
}
// Key 0xe2: straight copy of 14 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe2:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 10, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 11, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 12, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13);
_mm_storeu_si128((__m128i *)to + 13, tmp);
}
// 14 vectors: 224 input bytes consumed, 56 output integers produced.
in += 224;
to += 56;
break;
}
// Key 0xe3: straight copy of 13 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe3:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 10, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 11, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12);
_mm_storeu_si128((__m128i *)to + 12, tmp);
}
// 13 vectors: 208 input bytes consumed, 52 output integers produced.
in += 208;
to += 52;
break;
}
// Key 0xe4: straight copy of 12 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe4:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 10, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11);
_mm_storeu_si128((__m128i *)to + 11, tmp);
}
// 12 vectors: 192 input bytes consumed, 48 output integers produced.
in += 192;
to += 48;
break;
}
// Key 0xe5: straight copy of 11 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe5:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10);
_mm_storeu_si128((__m128i *)to + 10, tmp);
}
// 11 vectors: 176 input bytes consumed, 44 output integers produced.
in += 176;
to += 44;
break;
}
// Key 0xe6: straight copy of 10 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe6:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9);
_mm_storeu_si128((__m128i *)to + 9, tmp);
}
// 10 vectors: 160 input bytes consumed, 40 output integers produced.
in += 160;
to += 40;
break;
}
// Key 0xe7: straight copy of 9 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe7:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8);
_mm_storeu_si128((__m128i *)to + 8, tmp);
}
// 9 vectors: 144 input bytes consumed, 36 output integers produced.
in += 144;
to += 36;
break;
}
// Key 0xe8: straight copy of 8 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe8:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7);
_mm_storeu_si128((__m128i *)to + 7, tmp);
}
// 8 vectors: 128 input bytes consumed, 32 output integers produced.
in += 128;
to += 32;
break;
}
// Key 0xe9: straight copy of 7 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xe9:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6);
_mm_storeu_si128((__m128i *)to + 6, tmp);
}
// 7 vectors: 112 input bytes consumed, 28 output integers produced.
in += 112;
to += 28;
break;
}
// Key 0xea: straight copy of 6 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xea:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5);
_mm_storeu_si128((__m128i *)to + 5, tmp);
}
// 6 vectors: 96 input bytes consumed, 24 output integers produced.
in += 96;
to += 24;
break;
}
// Key 0xeb: straight copy of 5 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xeb:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4);
_mm_storeu_si128((__m128i *)to + 4, tmp);
}
// 5 vectors: 80 input bytes consumed, 20 output integers produced.
in += 80;
to += 20;
break;
}
// Key 0xec: straight copy of 4 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xec:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3);
_mm_storeu_si128((__m128i *)to + 3, tmp);
}
// 4 vectors: 64 input bytes consumed, 16 output integers produced.
in += 64;
to += 16;
break;
}
// Key 0xed: straight copy of 3 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xed:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2);
_mm_storeu_si128((__m128i *)to + 2, tmp);
}
// 3 vectors: 48 input bytes consumed, 12 output integers produced.
in += 48;
to += 12;
break;
}
// Key 0xee: straight copy of 2 vectors (128 bits each) from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xee:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1);
_mm_storeu_si128((__m128i *)to + 1, tmp);
}
// 2 vectors: 32 input bytes consumed, 8 output integers produced.
in += 32;
to += 8;
break;
}
// Key 0xef: straight copy of a single 128-bit vector from `in` to `to`.
// The data is stored exactly as loaded (no mask or shift); both the load and
// the store are the unaligned (loadu/storeu) forms.
case 0xef:
{
{
const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0);
_mm_storeu_si128((__m128i *)to + 0, tmp);
}
// 1 vector: 16 input bytes consumed, 4 output integers produced.
in += 16;
to += 4;
break;
}
// Key 0xf0: scalar path producing four outputs, each taken from one input byte
// and zero-extended on assignment to the 32-bit destination element.
case 0xf0:
{
*(to + 0) = *(uint8_t *)(in + 0);
*(to + 1) = *(uint8_t *)(in + 1);
*(to + 2) = *(uint8_t *)(in + 2);
*(to + 3) = *(uint8_t *)(in + 3);
// 4 input bytes consumed, 4 output integers produced.
in += 4;
to += 4;
break;
}
// Key 0xf1: scalar path producing three outputs, each taken from one input byte
// and zero-extended on assignment to the 32-bit destination element.
case 0xf1:
{
*(to + 0) = *(uint8_t *)(in + 0);
*(to + 1) = *(uint8_t *)(in + 1);
*(to + 2) = *(uint8_t *)(in + 2);
// 3 input bytes consumed, 3 output integers produced.
in += 3;
to += 3;
break;
}
// Key 0xf2: scalar path producing two outputs, each taken from one input byte
// and zero-extended on assignment to the 32-bit destination element.
case 0xf2:
{
*(to + 0) = *(uint8_t *)(in + 0);
*(to + 1) = *(uint8_t *)(in + 1);
// 2 input bytes consumed, 2 output integers produced.
in += 2;
to += 2;
break;
}
// Key 0xf3: scalar path producing one output, taken from one input byte and
// zero-extended on assignment to the 32-bit destination element.
case 0xf3:
{
*(to + 0) = *(uint8_t *)(in + 0);
// 1 input byte consumed, 1 output integer produced.
in += 1;
to += 1;
break;
}
// Key 0xf4: scalar path producing four outputs, each read as a uint16_t at a
// 2-byte stride and zero-extended on assignment.
// NOTE(review): the loads cast the byte pointer to uint16_t *, so they use the
// host byte order and nothing here guarantees 2-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xf4:
{
*(to + 0) = *(uint16_t *)(in + 2 * 0);
*(to + 1) = *(uint16_t *)(in + 2 * 1);
*(to + 2) = *(uint16_t *)(in + 2 * 2);
*(to + 3) = *(uint16_t *)(in + 2 * 3);
// 8 input bytes consumed, 4 output integers produced.
in += 2 * 4;
to += 4;
break;
}
// Key 0xf5: scalar path producing three outputs, each read as a uint16_t at a
// 2-byte stride and zero-extended on assignment.
// NOTE(review): the loads cast the byte pointer to uint16_t *, so they use the
// host byte order and nothing here guarantees 2-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xf5:
{
*(to + 0) = *(uint16_t *)(in + 2 * 0);
*(to + 1) = *(uint16_t *)(in + 2 * 1);
*(to + 2) = *(uint16_t *)(in + 2 * 2);
// 6 input bytes consumed, 3 output integers produced.
in += 2 * 3;
to += 3;
break;
}
// Key 0xf6: scalar path producing two outputs, each read as a uint16_t at a
// 2-byte stride and zero-extended on assignment.
// NOTE(review): the loads cast the byte pointer to uint16_t *, so they use the
// host byte order and nothing here guarantees 2-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xf6:
{
*(to + 0) = *(uint16_t *)(in + 2 * 0);
*(to + 1) = *(uint16_t *)(in + 2 * 1);
// 4 input bytes consumed, 2 output integers produced.
in += 2 * 2;
to += 2;
break;
}
// Key 0xf7: scalar path producing one output, read as a uint16_t and
// zero-extended on assignment.
// NOTE(review): the load casts the byte pointer to uint16_t *, so it uses the
// host byte order and nothing here guarantees 2-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xf7:
{
*(to + 0) = *(uint16_t *)(in + 2 * 0);
// 2 input bytes consumed, 1 output integer produced.
in += 2 * 1;
to += 1;
break;
}
// Key 0xf8: scalar path producing four outputs, each assembled from three
// consecutive bytes: the first byte lands in bits 16-23, the second in bits
// 8-15 and the third in bits 0-7 (most significant byte first).
// Built from single-byte reads, so the result does not depend on host byte order.
case 0xf8:
{
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
*(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2));
*(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2));
*(to + 3) = (*(uint8_t *)(in + 3 * 3) << 16) | (*(uint8_t *)(in + 3 * 3 + 1) << 8) | (*(uint8_t *)(in + 3 * 3 + 2));
// 12 input bytes consumed, 4 output integers produced.
in += 3 * 4;
to += 4;
break;
}
// Key 0xf9: scalar path producing three outputs, each assembled from three
// consecutive bytes: the first byte lands in bits 16-23, the second in bits
// 8-15 and the third in bits 0-7 (most significant byte first).
// Built from single-byte reads, so the result does not depend on host byte order.
case 0xf9:
{
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
*(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2));
*(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2));
// 9 input bytes consumed, 3 output integers produced.
in += 3 * 3;
to += 3;
break;
}
// Key 0xfa: scalar path producing two outputs, each assembled from three
// consecutive bytes: the first byte lands in bits 16-23, the second in bits
// 8-15 and the third in bits 0-7 (most significant byte first).
// Built from single-byte reads, so the result does not depend on host byte order.
case 0xfa:
{
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
*(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2));
// 6 input bytes consumed, 2 output integers produced.
in += 3 * 2;
to += 2;
break;
}
// Key 0xfb: scalar path producing one output assembled from three consecutive
// bytes: the first byte lands in bits 16-23, the second in bits 8-15 and the
// third in bits 0-7 (most significant byte first).
// Built from single-byte reads, so the result does not depend on host byte order.
case 0xfb:
{
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
// 3 input bytes consumed, 1 output integer produced.
in += 3 * 1;
to += 1;
break;
}
// Key 0xfc: scalar path producing four outputs, each read as a whole uint32_t
// at a 4-byte stride and stored unchanged.
// NOTE(review): the loads cast the byte pointer to uint32_t *, so they use the
// host byte order and nothing here guarantees 4-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xfc:
{
*(to + 0) = *(uint32_t *)(in + 4 * 0);
*(to + 1) = *(uint32_t *)(in + 4 * 1);
*(to + 2) = *(uint32_t *)(in + 4 * 2);
*(to + 3) = *(uint32_t *)(in + 4 * 3);
// 16 input bytes consumed, 4 output integers produced.
in += 4 * 4;
to += 4;
break;
}
// Key 0xfd: scalar path producing three outputs, each read as a whole uint32_t
// at a 4-byte stride and stored unchanged.
// NOTE(review): the loads cast the byte pointer to uint32_t *, so they use the
// host byte order and nothing here guarantees 4-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xfd:
{
*(to + 0) = *(uint32_t *)(in + 4 * 0);
*(to + 1) = *(uint32_t *)(in + 4 * 1);
*(to + 2) = *(uint32_t *)(in + 4 * 2);
// 12 input bytes consumed, 3 output integers produced.
in += 4 * 3;
to += 3;
break;
}
// Key 0xfe: scalar path producing two outputs, each read as a whole uint32_t
// at a 4-byte stride and stored unchanged.
// NOTE(review): the loads cast the byte pointer to uint32_t *, so they use the
// host byte order and nothing here guarantees 4-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xfe:
{
*(to + 0) = *(uint32_t *)(in + 4 * 0);
*(to + 1) = *(uint32_t *)(in + 4 * 1);
// 8 input bytes consumed, 2 output integers produced.
in += 4 * 2;
to += 2;
break;
}
// Key 0xff: scalar path producing one output, read as a whole uint32_t and
// stored unchanged.
// NOTE(review): the load casts the byte pointer to uint32_t *, so it uses the
// host byte order and nothing here guarantees 4-byte alignment of `in` --
// confirm that is acceptable on the intended targets.
case 0xff:
{
*(to + 0) = *(uint32_t *)(in + 4 * 0);
// 4 input bytes consumed, 1 output integer produced.
in += 4 * 1;
to += 1;
break;
}
}
}
}