5451 lines
282 KiB
C++
5451 lines
282 KiB
C++
/*
	Bit-mask tables, one per supported bit width.

	Each static_mask_N array holds four identical 32-bit words whose low N bits
	are set (e.g. 0x1fffff for N = 21, 0x01 for N = 1), i.e. 128 bits in total.
	ANT_compress_qmx::decodeArray() loads every table into an __m128i with
	_mm_loadu_si128() before entering its decode loop, and additionally reloads
	static_mask_1 as the fill value of its run cases when NO_ZEROS is defined.

	NOTE(review): ALIGN_16 is defined elsewhere in the project; presumably it
	requests 16-byte alignment for these arrays -- confirm against its definition.
	NOTE(review): the arrays are not const, although the code visible in this
	part of the file only ever reads them -- confirm no other code writes them.
*/
static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};
|
|
static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};
|
|
static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};
|
|
static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};
|
|
static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};
|
|
static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f};
|
|
static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};
|
|
static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};
|
|
static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};
|
|
static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};
|
|
static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01};
|
|
void ANT_compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)
|
|
{
|
|
__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;
|
|
uint8_t *in = (uint8_t *)source;
|
|
uint32_t *end = to + destination_integers;
|
|
uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1);
|
|
uint8_t *keys = (uint8_t *)source + len - key_start;
|
|
|
|
mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);
|
|
mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);
|
|
mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);
|
|
mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);
|
|
mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);
|
|
mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);
|
|
mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);
|
|
mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);
|
|
mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);
|
|
mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);
|
|
mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
|
|
while (to < end)
|
|
{
|
|
switch (*keys++)
|
|
{
|
|
case 0x00:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x01:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x02:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x03:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x04:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x05:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x06:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x07:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x08:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x09:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x0a:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x0b:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x0c:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x0d:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x0e:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
case 0x0f:
|
|
#ifdef NO_ZEROS
|
|
tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 63, tmp);
|
|
to += 256;
|
|
break;
|
|
case 0x10:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x11:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x12:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x13:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x14:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x15:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x16:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x17:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x18:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x19:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x1a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x1b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x1c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x1d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x1e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
case 0x1f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 1);
|
|
_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));
|
|
in += 16;
|
|
to += 128;
|
|
break;
|
|
case 0x20:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x21:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x22:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x23:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x24:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x25:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x26:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x27:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x28:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x29:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x2a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x2b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x2c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x2d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x2e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
case 0x2f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 2);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));
|
|
in += 16;
|
|
to += 64;
|
|
break;
|
|
case 0x30:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x31:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x32:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x33:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x34:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x35:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x36:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x37:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x38:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x39:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x3a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x3b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x3c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x3d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x3e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
case 0x3f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 3);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));
|
|
in += 16;
|
|
to += 40;
|
|
break;
|
|
case 0x40:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x41:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x42:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x43:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x44:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x45:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x46:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x47:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x48:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x49:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x4a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x4b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x4c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x4d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x4e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
case 0x4f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 4);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));
|
|
in += 16;
|
|
to += 32;
|
|
break;
|
|
case 0x50:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x51:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x52:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x53:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x54:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x55:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x56:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x57:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x58:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x59:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x5a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x5b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x5c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x5d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x5e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
case 0x5f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 5);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));
|
|
in += 16;
|
|
to += 24;
|
|
break;
|
|
case 0x60:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x61:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x62:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x63:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x64:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x65:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x66:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x67:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x68:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x69:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x6a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x6b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x6c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x6d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x6e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
case 0x6f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 6);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));
|
|
in += 16;
|
|
to += 20;
|
|
break;
|
|
case 0x70:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x71:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x72:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x73:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x74:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x75:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x76:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x77:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x78:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x79:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x7a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x7b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x7c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x7d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x7e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
case 0x7f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 3);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 7);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));
|
|
in += 32;
|
|
to += 36;
|
|
break;
|
|
case 0x80:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x81:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x82:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x83:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x84:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x85:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x86:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x87:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x88:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x89:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x8a:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x8b:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x8c:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x8d:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x8e:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
case 0x8f:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));
|
|
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));
|
|
tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));
|
|
in += 16;
|
|
to += 16;
|
|
break;
|
|
case 0x90:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x91:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x92:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x93:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x94:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x95:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x96:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x97:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x98:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x99:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x9a:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x9b:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x9c:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x9d:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x9e:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
case 0x9f:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 4);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 9);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));
|
|
in += 32;
|
|
to += 28;
|
|
break;
|
|
case 0xa0:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa1:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa2:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa3:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa4:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa5:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa6:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa7:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa8:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xa9:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xaa:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xab:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xac:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xad:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xae:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
case 0xaf:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));
|
|
byte_stream = _mm_srli_epi64(byte_stream, 10);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));
|
|
in += 16;
|
|
to += 12;
|
|
break;
|
|
case 0xb0:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb1:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb2:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb3:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb4:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb5:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb6:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb7:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb8:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xb9:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xba:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xbb:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xbc:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xbd:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xbe:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
case 0xbf:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream_2, 8);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));
|
|
byte_stream = _mm_srli_epi32(byte_stream, 12);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));
|
|
in += 32;
|
|
to += 20;
|
|
break;
|
|
case 0xc0:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc1:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc2:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc3:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc4:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc5:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc6:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc7:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc8:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xc9:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xca:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xcb:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xcc:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xcd:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xce:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
case 0xcf:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
in += 16;
|
|
to += 8;
|
|
break;
|
|
case 0xd0:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd1:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd2:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd3:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd4:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd5:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd6:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd7:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd8:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xd9:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xda:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xdb:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xdc:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xdd:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xde:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
case 0xdf:
|
|
byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));
|
|
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);
|
|
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
in += 32;
|
|
to += 12;
|
|
break;
|
|
case 0xe0:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe1:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe2:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe3:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe4:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe5:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe6:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe7:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe8:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xe9:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xea:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xeb:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xec:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xed:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xee:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
case 0xef:
|
|
tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_storeu_si128((__m128i *)to, tmp);
|
|
in += 16;
|
|
to += 4;
|
|
break;
|
|
case 0xf0:
|
|
*to = *(uint8_t *)in;
|
|
in += 1;
|
|
to += 1;
|
|
case 0xf1:
|
|
*to = *(uint8_t *)in;
|
|
in += 1;
|
|
to += 1;
|
|
case 0xf2:
|
|
*to = *(uint8_t *)in;
|
|
in += 1;
|
|
to += 1;
|
|
case 0xf3:
|
|
*to = *(uint8_t *)in;
|
|
in += 1;
|
|
to += 1;
|
|
break;
|
|
case 0xf4:
|
|
*to = *(uint16_t *)in;
|
|
in += 2;
|
|
to += 1;
|
|
case 0xf5:
|
|
*to = *(uint16_t *)in;
|
|
in += 2;
|
|
to += 1;
|
|
case 0xf6:
|
|
*to = *(uint16_t *)in;
|
|
in += 2;
|
|
to += 1;
|
|
case 0xf7:
|
|
*to = *(uint16_t *)in;
|
|
in += 2;
|
|
to += 1;
|
|
break;
|
|
case 0xf8:
|
|
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
|
|
in += 3;
|
|
to += 1;
|
|
case 0xf9:
|
|
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
|
|
in += 3;
|
|
to += 1;
|
|
case 0xfa:
|
|
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
|
|
in += 3;
|
|
to += 1;
|
|
case 0xfb:
|
|
*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));
|
|
in += 3;
|
|
to += 1;
|
|
break;
|
|
case 0xfc:
|
|
*to = *(uint32_t *)in;
|
|
in += 4;
|
|
to += 1;
|
|
case 0xfd:
|
|
*to = *(uint32_t *)in;
|
|
in += 4;
|
|
to += 1;
|
|
case 0xfe:
|
|
*to = *(uint32_t *)in;
|
|
in += 4;
|
|
to += 1;
|
|
case 0xff:
|
|
*to = *(uint32_t *)in;
|
|
in += 4;
|
|
to += 1;
|
|
break;
|
|
break;
|
|
}
|
|
}
|
|
}
|